// Copyright (C) 2012 The Libphonenumber Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: Patrick Mezard
16 
17 #include "cpp-build/generate_geocoding_data.h"
18 
19 #include <dirent.h>
20 #include <errno.h>
21 #include <locale>
22 #include <sys/stat.h>
23 #include <algorithm>
24 #include <cctype>
25 #include <cmath>
26 #include <cstdio>
27 #include <cstring>
28 #include <iomanip>
29 #include <iterator>
30 #include <map>
31 #include <set>
32 #include <sstream>
33 #include <string>
34 #include <utility>
35 #include <vector>
36 
37 #include "base/basictypes.h"
38 
39 namespace i18n {
40 namespace phonenumbers {
41 
42 using std::map;
43 using std::string;
44 using std::vector;
45 using std::set;
46 using std::pair;
47 
// Scoped guard around a C-style handle. It stores the address of the caller's
// handle variable together with a release function. On destruction, or on an
// explicit Close(), a non-NULL handle is passed to the release function and
// the caller's variable is reset to NULL, which makes a later Close() a no-op.
template <typename ResourceType> class AutoCloser {
 public:
  typedef int (*ReleaseFunction) (ResourceType* resource);

  AutoCloser(ResourceType** resource, ReleaseFunction release_function)
      : resource_(resource), release_function_(release_function) {}

  ~AutoCloser() { Close(); }

  // Returns the guarded handle; this is NULL once it has been released.
  ResourceType* get_resource() const { return *resource_; }

  // Releases the handle if it is still set. The value returned by the release
  // function is ignored.
  void Close() {
    ResourceType* const held = *resource_;
    if (held == NULL) {
      return;
    }
    release_function_(held);
    *resource_ = NULL;
  }

 private:
  ResourceType** resource_;
  ReleaseFunction release_function_;
};
76 
// Kinds of directory entries distinguished by this tool.
enum DirEntryKinds {
  kFile = 0,       // Anything recorded as a plain file.
  kDirectory = 1   // A directory.
};
81 
82 class DirEntry {
83  public:
DirEntry(const char * n,DirEntryKinds k)84   DirEntry(const char* n, DirEntryKinds k)
85       : name_(n),
86         kind_(k)
87   {}
88 
name() const89   const std::string& name() const { return name_; }
kind() const90   DirEntryKinds kind() const { return kind_; }
91 
92  private:
93   std::string name_;
94   DirEntryKinds kind_;
95 };
96 
97 // Lists directory entries in path. "." and ".." are excluded. Returns true on
98 // success.
ListDirectory(const string & path,vector<DirEntry> * entries)99 bool ListDirectory(const string& path, vector<DirEntry>* entries) {
100   entries->clear();
101   DIR* dir = opendir(path.c_str());
102   if (!dir) {
103     return false;
104   }
105   AutoCloser<DIR> dir_closer(&dir, closedir);
106   struct dirent *entry;
107   struct stat entry_stat;
108   while (true) {
109     // Set errno to 0 to be able to check if an error occurs during the
110     // readdir() call. NULL is the return value when the end of the directory
111     // stream is reached or when an error occurs, and the errno check is the
112     // only thing that helps us distinguish between the two cases. See
113     // documentation at
114     // http://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html
115     errno = 0;
116     entry = readdir(dir);
117     if (entry == NULL) {
118       return errno == 0;
119     }
120     if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
121        continue;
122     }
123     const string entry_path = path + "/" + entry->d_name;
124     if (stat(entry_path.c_str(), &entry_stat)) {
125       return false;
126     }
127     DirEntryKinds kind = kFile;
128     if (S_ISDIR(entry_stat.st_mode)) {
129       kind = kDirectory;
130     } else if (!S_ISREG(entry_stat.st_mode)) {
131       continue;
132     }
133     entries->push_back(DirEntry(entry->d_name, kind));
134   }
135 }
136 
// Returns true if the last suffix.size() characters of s are exactly suffix.
// An empty suffix matches every string.
bool EndsWith(const string& s, const string& suffix) {
  const string::size_type suffix_size = suffix.size();
  return s.size() >= suffix_size &&
         s.compare(s.size() - suffix_size, suffix_size, suffix) == 0;
}
144 
// Parses the integer at the start of s into *n using stream extraction, so
// leading whitespace is skipped and characters following the number are not
// treated as an error. Returns true if a number could be extracted.
bool StrToInt(const string& s, int32* n) {
  std::istringstream parser(s);
  return !(parser >> *n).fail();
}
152 
// Writes the decimal representation of n into *s, replacing its previous
// content. Returns true on success; *s is left untouched on failure.
bool IntToStr(int32 n, string* s) {
  std::ostringstream formatter;
  formatter << n;
  if (formatter.fail()) {
    return false;
  }
  *s = formatter.str();
  return true;
}
160 
161 // Parses the prefix descriptions file at path, clears and fills the output
162 // prefixes phone number prefix to description mapping.
163 // Returns true on success.
ParsePrefixes(const string & path,map<int32,string> * prefixes)164 bool ParsePrefixes(const string& path, map<int32, string>* prefixes) {
165   prefixes->clear();
166   FILE* input = fopen(path.c_str(), "r");
167   if (!input) {
168     return false;
169   }
170   AutoCloser<FILE> input_closer(&input, fclose);
171   const int kMaxLineLength = 2*1024;
172   vector<char> buffer(kMaxLineLength);
173   vector<char>::iterator begin, end, sep;
174   string prefix, description;
175   int32 prefix_code;
176   while (fgets(&buffer[0], buffer.size(), input)) {
177     begin = buffer.begin();
178     end = std::find(begin, buffer.end(), '\0');
179     if (end == begin) {
180       continue;
181     }
182     --end;
183     if (*end != '\n' && !feof(input)) {
184       // A line without LF can only happen at the end of file.
185       return false;
186     }
187 
188     // Trim and check for comments.
189     for (; begin != end && std::isspace(*begin); ++begin) {}
190     for (; end != begin && std::isspace(*(end - 1)); --end) {}
191     if (begin == end || *begin == '#') {
192       continue;
193     }
194 
195     sep = std::find(begin, end, '|');
196     if (sep == end) {
197       continue;
198     }
199     prefix = string(begin, sep);
200     if (!StrToInt(prefix, &prefix_code)) {
201       return false;
202     }
203     (*prefixes)[prefix_code] = string(sep + 1, end);
204   }
205   return ferror(input) == 0;
206 }
207 
// Builds a C string literal from s. The output is enclosed in double-quotes.
// Double quotes, single quotes and backslashes of the input are escaped with
// a backslash, and non-ASCII or control characters are written as two-digit
// hexadecimal escapes.
//
// The literal is split ("" is inserted) around every hexadecimal escape,
// because a \x escape would otherwise absorb any hexadecimal digits that
// follow it.
//
// An input string:
//   Op\xc3\xa9ra
// becomes:
//   "Op""\xc3""\xa9""ra"
string MakeStringLiteral(const string& s) {
  std::stringstream buffer;
  // Kind of the previously written character: 0 when nothing was written yet,
  // 1 for a printable character, 2 for a hexadecimal escape.
  int prev_is_hex = 0;
  buffer << std::hex << std::setfill('0');
  buffer << "\"";
  for (string::const_iterator it = s.begin(); it != s.end(); ++it) {
    const char c = *it;
    if (c >= 32 && c < 127) {
      if (prev_is_hex == 2) {
        buffer << "\"\"";
      }
      // An unescaped '"' would terminate the generated literal and an
      // unescaped '\\' would start an escape sequence. '\'' does not need
      // escaping but is kept escaped so existing output does not change.
      if (c == '"' || c == '\\' || c == '\'') {
        buffer << "\\";
      }
      buffer << c;
      prev_is_hex = 1;
    } else {
      if (prev_is_hex != 0) {
        buffer << "\"\"";
      }
      // Go through unsigned char so that bytes >= 0x80 print as 80..ff
      // whether or not plain char is signed.
      buffer << "\\x" << std::setw(2)
             << static_cast<int>(static_cast<unsigned char>(c));
      prev_is_hex = 2;
    }
  }
  buffer << "\"";
  return buffer.str();
}
242 
WriteStringLiteral(const string & s,FILE * output)243 void WriteStringLiteral(const string& s, FILE* output) {
244   string literal = MakeStringLiteral(s);
245   fprintf(output, "%s", literal.c_str());
246 }
247 
// License header placed at the top of the generated source file. It ends with
// a notice that the file is generated, followed by an empty line.
const char kLicense[] =
  "// Copyright (C) 2012 The Libphonenumber Authors\n"
  "//\n"
  "// Licensed under the Apache License, Version 2.0 (the \"License\");\n"
  "// you may not use this file except in compliance with the License.\n"
  "// You may obtain a copy of the License at\n"
  "//\n"
  "// http://www.apache.org/licenses/LICENSE-2.0\n"
  "//\n"
  "// Unless required by applicable law or agreed to in writing, software\n"
  "// distributed under the License is distributed on an \"AS IS\" BASIS,\n"
  "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or "
  "implied.\n"
  "// See the License for the specific language governing permissions and\n"
  "// limitations under the License.\n"
  "//\n"
  "// This file is generated automatically, do not edit it manually.\n"
  "\n";

// Writes kLicense verbatim to output.
void WriteLicense(FILE* output) {
  fputs(kLicense, output);
}
270 
// Names of the outer and inner namespaces wrapping the generated code.
const char kI18NNS[] = "i18n";
const char kPhoneNumbersNS[] = "phonenumbers";

// Writes the lines opening the outer, then the inner namespace.
void WriteNSHeader(FILE* output) {
  fprintf(output,
          "namespace %s {\n"
          "namespace %s {\n",
          kI18NNS, kPhoneNumbersNS);
}

// Writes the lines closing the inner, then the outer namespace.
void WriteNSFooter(FILE* output) {
  fprintf(output,
          "}  // namespace %s\n"
          "}  // namespace %s\n",
          kPhoneNumbersNS, kI18NNS);
}
283 
// Writes the #include directives of the generated source: the geocoding
// header named after base_name, then basictypes.h, each followed by an empty
// line.
void WriteCppHeader(const string& base_name, FILE* output) {
  fprintf(output,
          "#include \"phonenumbers/geocoding/%s.h\"\n"
          "\n"
          "#include \"phonenumbers/base/basictypes.h\"\n"
          "\n",
          base_name.c_str());
}
291 
// Writes two initializer lines for the array variable "name": the array
// itself, then an expression computing its number of elements.
void WriteArrayAndSize(const string& name, FILE* output) {
  const char* const array_name = name.c_str();
  fprintf(output,
          "  %s,\n"
          "  sizeof(%s)/sizeof(*%s),\n",
          array_name, array_name, array_name);
}
296 
// Writes the definition of a PrefixDescriptions variable called "name". Its
// initializer lists the "prefixes_name" array and its element count, the
// "desc_name" array, then the "possible_lengths_name" array and its element
// count:
//
// const PrefixDescriptions ${name} = {
//   ${prefixes_name},
//   sizeof(${prefixes_name})/sizeof(*${prefixes_name}),
//   ${desc_name},
//   ${possible_lengths_name},
//   sizeof(${possible_lengths_name})/sizeof(*${possible_lengths_name}),
// };
//
void WritePrefixDescriptionsDefinition(
    const string& name, const string& prefixes_name, const string& desc_name,
    const string& possible_lengths_name, FILE* output) {
  const char* const prefixes = prefixes_name.c_str();
  const char* const lengths = possible_lengths_name.c_str();
  fprintf(output,
          "const PrefixDescriptions %s = {\n"
          "  %s,\n"
          "  sizeof(%s)/sizeof(*%s),\n"
          "  %s,\n"
          "  %s,\n"
          "  sizeof(%s)/sizeof(*%s),\n"
          "};\n",
          name.c_str(),
          prefixes, prefixes, prefixes,
          desc_name.c_str(),
          lengths, lengths, lengths);
}
318 
319 // Writes prefixes, descriptions and possible_lengths arrays built from the
320 // phone number prefix to description mapping "prefixes". Binds these arrays
321 // in a single PrefixDescriptions variable named "var_name".
322 //
323 // const int32 ${var_name}_prefixes[] = {
324 //   1201,
325 //   1650,
326 // };
327 //
328 // const char* ${var_name}_descriptions[] = {
329 //   "New Jerse",
330 //   "Kalifornie",
331 // };
332 //
333 // const int32 ${var_name}_possible_lengths[] = {
334 //   4,
335 // };
336 //
337 // const PrefixDescriptions ${var_name} = {
338 //   ...
339 // };
340 //
WritePrefixDescriptions(const string & var_name,const map<int,string> & prefixes,FILE * output)341 void WritePrefixDescriptions(const string& var_name, const map<int, string>&
342                              prefixes, FILE* output) {
343   set<int> possible_lengths;
344   const string prefixes_name = var_name + "_prefixes";
345   fprintf(output, "const int32 %s[] = {\n", prefixes_name.c_str());
346   for (map<int, string>::const_iterator it = prefixes.begin();
347        it != prefixes.end(); ++it) {
348     fprintf(output, "  %d,\n", it->first);
349     possible_lengths.insert(static_cast<int>(log10(it->first) + 1));
350   }
351   fprintf(output,
352           "};\n"
353           "\n");
354 
355   const string desc_name = var_name + "_descriptions";
356   fprintf(output, "const char* %s[] = {\n", desc_name.c_str());
357   for (map<int, string>::const_iterator it = prefixes.begin();
358        it != prefixes.end(); ++it) {
359     fprintf(output, "  ");
360     WriteStringLiteral(it->second, output);
361     fprintf(output, ",\n");
362   }
363   fprintf(output,
364           "};\n"
365           "\n");
366 
367   const string possible_lengths_name = var_name + "_possible_lengths";
368   fprintf(output, "const int32 %s[] = {\n ", possible_lengths_name.c_str());
369   for (set<int>::const_iterator it = possible_lengths.begin();
370        it != possible_lengths.end(); ++it) {
371     fprintf(output, " %d,", *it);
372   }
373   fprintf(output,
374           "\n"
375           "};\n"
376           "\n");
377 
378   WritePrefixDescriptionsDefinition(var_name, prefixes_name, desc_name,
379                                     possible_lengths_name, output);
380   fprintf(output, "\n");
381 }
382 
// Writes two parallel arrays built from "prefix_var_names", which maps
// "<country calling code>_<language>" keys to the names of PrefixDescriptions
// variables: the first array lists the keys, the second one the addresses of
// the matching variables, both in the iteration order of the map.
//
// const char* prefix_language_code_pairs[] = {
//   "1_de",
//   "1_en",
// };
//
// const PrefixDescriptions* prefixes_descriptions[] = {
//   &prefix_1_de,
//   &prefix_1_en,
// };
//
void WritePrefixesDescriptions(const map<string, string>& prefix_var_names,
                               FILE* output) {
  typedef map<string, string>::const_iterator NameIterator;
  const NameIterator names_end = prefix_var_names.end();

  fputs("const char* prefix_language_code_pairs[] = {\n", output);
  for (NameIterator name = prefix_var_names.begin(); name != names_end;
       ++name) {
    fprintf(output, "  \"%s\",\n", name->first.c_str());
  }
  fputs("};\n\n", output);

  fputs("const PrefixDescriptions* prefixes_descriptions[] = {\n", output);
  for (NameIterator name = prefix_var_names.begin(); name != names_end;
       ++name) {
    fprintf(output, "  &%s,\n", name->second.c_str());
  }
  fputs("};\n\n", output);
}
416 
417 // For each entry in "languages" mapping a country calling code to a set
418 // of available languages, writes a sorted array of languages, then wraps it
419 // into a CountryLanguages instance. Finally, writes a pair of arrays mapping
420 // country calling codes to CountryLanguages instances.
421 //
422 // const char* country_1[] = {
423 //   "de",
424 //   "en",
425 // };
426 //
427 // const CountryLanguages country_1_languages = {
428 //   country_1,
429 //   sizeof(country_1)/sizeof(*country_1),
430 // };
431 //
432 // [...]
433 //
434 // const CountryLanguages* country_languages[] = {
435 //   &country_1_languages,
436 //   [...]
437 // }
438 //
439 // const int country_calling_codes[] = {
440 //   1,
441 //   [...]
442 // };
443 //
WriteCountryLanguages(const map<int32,set<string>> & languages,FILE * output)444 bool WriteCountryLanguages(const map<int32, set<string> >& languages,
445                            FILE* output) {
446   vector<string> country_languages_vars;
447   vector<string> countries;
448   for (map<int32, set<string> >::const_iterator it = languages.begin();
449        it != languages.end(); ++it) {
450     string country_code;
451     if (!IntToStr(it->first, &country_code)) {
452       return false;
453     }
454     const string country_var = "country_" + country_code;
455     fprintf(output, "const char* %s[] = {\n", country_var.c_str());
456     for (set<string>::const_iterator it_lang = it->second.begin();
457          it_lang != it->second.end(); ++it_lang) {
458       fprintf(output, "  \"%s\",\n", it_lang->c_str());
459     }
460     fprintf(output,
461             "};\n"
462             "\n");
463 
464     const string country_languages_var = country_var + "_languages";
465     fprintf(output, "const CountryLanguages %s = {\n",
466             country_languages_var.c_str());
467     WriteArrayAndSize(country_var, output);
468     fprintf(output,
469             "};\n"
470             "\n");
471     country_languages_vars.push_back(country_languages_var);
472     countries.push_back(country_code);
473   }
474 
475   fprintf(output,
476           "\n"
477           "const CountryLanguages* countries_languages[] = {\n");
478   for (vector<string>::const_iterator
479        it_languages_var = country_languages_vars.begin();
480        it_languages_var != country_languages_vars.end(); ++it_languages_var) {
481     fprintf(output, "  &%s,\n", it_languages_var->c_str());
482   }
483   fprintf(output,
484           "};\n"
485           "\n"
486           "const int country_calling_codes[] = {\n");
487   for (vector<string>::const_iterator it_country = countries.begin();
488        it_country != countries.end(); ++it_country) {
489     fprintf(output, "  %s,\n", it_country->c_str());
490   }
491   fprintf(output,
492           "};\n"
493           "\n");
494   return true;
495 }
496 
// Returns a copy of input where all occurrences of pattern are replaced with
// value. Occurrences are searched from left to right and do not overlap; the
// inserted values are not searched again. If pattern is empty, input is
// returned unchanged.
string ReplaceAll(const string& input, const string& pattern,
                  const string& value) {
  if (pattern.empty()) {
    return input;
  }
  string result;
  string::size_type cursor = 0;
  for (string::size_type hit = input.find(pattern, cursor);
       hit != string::npos; hit = input.find(pattern, cursor)) {
    // Copy the text preceding the match, then the replacement.
    result.append(input, cursor, hit - cursor);
    result.append(value);
    cursor = hit + pattern.length();
  }
  // Copy what follows the last match.
  result.append(input, cursor, string::npos);
  return result;
}
520 
521 // Writes data accessor definitions, prefixed with "accessor_prefix".
WriteAccessorsDefinitions(const string & accessor_prefix,FILE * output)522 void WriteAccessorsDefinitions(const string& accessor_prefix, FILE* output) {
523   string templ =
524       "const int* get$prefix$_country_calling_codes() {\n"
525       "  return country_calling_codes;\n"
526       "}\n"
527       "\n"
528       "int get$prefix$_country_calling_codes_size() {\n"
529       "  return sizeof(country_calling_codes)\n"
530       "      /sizeof(*country_calling_codes);\n"
531       "}\n"
532       "\n"
533       "const CountryLanguages* get$prefix$_country_languages(int index) {\n"
534       "  return countries_languages[index];\n"
535       "}\n"
536       "\n"
537       "const char** get$prefix$_prefix_language_code_pairs() {\n"
538       "  return prefix_language_code_pairs;\n"
539       "}\n"
540       "\n"
541       "int get$prefix$_prefix_language_code_pairs_size() {\n"
542       "  return sizeof(prefix_language_code_pairs)\n"
543       "      /sizeof(*prefix_language_code_pairs);\n"
544       "}\n"
545       "\n"
546       "const PrefixDescriptions* get$prefix$_prefix_descriptions(int index) {\n"
547       "  return prefixes_descriptions[index];\n"
548       "}\n";
549   string defs = ReplaceAll(templ, "$prefix$", accessor_prefix);
550   fprintf(output, "%s", defs.c_str());
551 }
552 
553 // Writes geocoding data .cc file. "data_path" is the path of geocoding textual
554 // data directory. "base_name" is the base name of the .h/.cc pair, like
555 // "geocoding_data".
WriteSource(const string & data_path,const string & base_name,const string & accessor_prefix,FILE * output)556 bool WriteSource(const string& data_path, const string& base_name,
557                  const string& accessor_prefix, FILE* output) {
558   WriteLicense(output);
559   WriteCppHeader(base_name, output);
560   WriteNSHeader(output);
561   fprintf(output,
562           "namespace {\n"
563           "\n");
564 
565   // Enumerate language/script directories.
566   map<string, string> prefix_vars;
567   map<int32, set<string> > country_languages;
568   vector<DirEntry> entries;
569   if (!ListDirectory(data_path, &entries)) {
570     fprintf(stderr, "failed to read directory entries");
571     return false;
572   }
573   for (vector<DirEntry>::const_iterator it = entries.begin();
574        it != entries.end(); ++it) {
575     if (it->kind() != kDirectory) {
576       continue;
577     }
578     // Enumerate country calling code files.
579     const string dir_path = data_path + "/" + it->name();
580     vector<DirEntry> files;
581     if (!ListDirectory(dir_path, &files)) {
582       fprintf(stderr, "failed to read file entries\n");
583       return false;
584     }
585     for (vector<DirEntry>::const_iterator it_files = files.begin();
586          it_files != files.end(); ++it_files) {
587       const string fname = it_files->name();
588       if (!EndsWith(fname, ".txt")) {
589        continue;
590       }
591       int32 country_code;
592       const string country_code_str = fname.substr(0, fname.length() - 4);
593       if (!StrToInt(country_code_str, &country_code)) {
594         return false;
595       }
596       const string path = dir_path + "/" + fname;
597 
598       map<int32, string> prefixes;
599       if (!ParsePrefixes(path, &prefixes)) {
600         return false;
601       }
602 
603       const string prefix_var = "prefix_" + country_code_str + "_" + it->name();
604       WritePrefixDescriptions(prefix_var, prefixes, output);
605       prefix_vars[country_code_str + "_" + it->name()] = prefix_var;
606       country_languages[country_code].insert(it->name());
607     }
608   }
609   WritePrefixesDescriptions(prefix_vars, output);
610   if (!WriteCountryLanguages(country_languages, output)) {
611     return false;
612   }
613   fprintf(output, "}  // namespace\n");
614   fprintf(output, "\n");
615   WriteAccessorsDefinitions(accessor_prefix, output);
616   WriteNSFooter(output);
617   return ferror(output) == 0;
618 }
619 
// Prints "message" as an error to stderr, followed by the command line
// usage. Always returns 1, so that the result can be used as the exit status.
int PrintHelp(const string& message) {
  fprintf(stderr, "error: %s\n", message.c_str());
  fprintf(stderr,
          "generate_geocoding_data DATADIR CCPATH [ACCESSOR_PREFIX]\n");
  return 1;
}
625 
Main(int argc,const char * argv[])626 int Main(int argc, const char* argv[]) {
627   if (argc < 2) {
628     return PrintHelp("geocoding data root directory expected");
629   }
630   if (argc < 3) {
631     return PrintHelp("output source path expected");
632   }
633   string accessor_prefix = "";
634   if (argc > 3) {
635     accessor_prefix = argv[3];
636   }
637   const string root_path(argv[1]);
638   string source_path(argv[2]);
639   std::replace(source_path.begin(), source_path.end(), '\\', '/');
640   string base_name = source_path;
641   if (base_name.rfind('/') != string::npos) {
642     base_name = base_name.substr(base_name.rfind('/') + 1);
643   }
644   base_name = base_name.substr(0, base_name.rfind('.'));
645 
646   FILE* source_fp = fopen(source_path.c_str(), "w");
647   if (!source_fp) {
648     fprintf(stderr, "failed to open %s\n", source_path.c_str());
649     return 1;
650   }
651   AutoCloser<FILE> source_closer(&source_fp, fclose);
652   if (!WriteSource(root_path, base_name, accessor_prefix,
653                    source_fp)) {
654     return 1;
655   }
656   return 0;
657 }
658 
659 }  // namespace phonenumbers
660 }  // namespace i18n
661