1 // Copyright (C) 2012 The Libphonenumber Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: Patrick Mezard
16
17 #include "cpp-build/generate_geocoding_data.h"
18
19 #include <dirent.h>
20 #include <errno.h>
21 #include <locale>
22 #include <sys/stat.h>
23 #include <algorithm>
24 #include <cctype>
25 #include <cmath>
26 #include <cstdio>
27 #include <cstring>
28 #include <iomanip>
29 #include <iterator>
30 #include <map>
31 #include <set>
32 #include <sstream>
33 #include <string>
34 #include <utility>
35 #include <vector>
36
37 #include "base/basictypes.h"
38
39 #include "absl/container/btree_map.h"
40 #include "absl/container/btree_set.h"
41
42 namespace i18n {
43 namespace phonenumbers {
44
45 using std::map;
46 using std::string;
47 using std::vector;
48 using std::set;
49 using std::pair;
50
// Scope guard that releases a C-style resource (FILE*, DIR*, ...) when it is
// destroyed. The guard stores the address of the caller's pointer rather than
// the pointer itself, and resets the caller's pointer to null once the
// resource has been released, so a second Close() is a no-op.
template <typename ResourceType>
class AutoCloser {
 public:
  // Signature of the function used to release the resource.
  using ReleaseFunction = int (*)(ResourceType* resource);

  // "resource" is the address of the pointer to guard. "release_function" is
  // called with the pointed-to resource when the guard closes it.
  AutoCloser(ResourceType** resource, ReleaseFunction release_function)
      : resource_(resource), release_function_(release_function) {}

  ~AutoCloser() { Close(); }

  // Returns the guarded pointer; null once the resource has been released.
  ResourceType* get_resource() const { return *resource_; }

  // Releases the resource now. Does nothing if the guarded pointer is null.
  void Close() {
    ResourceType* const held = *resource_;
    if (!held) {
      return;
    }
    release_function_(held);
    *resource_ = nullptr;
  }

 private:
  ResourceType** resource_;
  ReleaseFunction release_function_;
};
79
// Kinds of directory entries reported by ListDirectory().
enum DirEntryKinds {
  kFile = 0,       // Regular file.
  kDirectory = 1   // Directory.
};
84
85 class DirEntry {
86 public:
DirEntry(const char * n,DirEntryKinds k)87 DirEntry(const char* n, DirEntryKinds k)
88 : name_(n),
89 kind_(k)
90 {}
91
name() const92 const std::string& name() const { return name_; }
kind() const93 DirEntryKinds kind() const { return kind_; }
94
95 private:
96 std::string name_;
97 DirEntryKinds kind_;
98 };
99
100 // Lists directory entries in path. "." and ".." are excluded. Returns true on
101 // success.
ListDirectory(const string & path,vector<DirEntry> * entries)102 bool ListDirectory(const string& path, vector<DirEntry>* entries) {
103 entries->clear();
104 DIR* dir = opendir(path.c_str());
105 if (!dir) {
106 return false;
107 }
108 AutoCloser<DIR> dir_closer(&dir, closedir);
109 struct dirent *entry;
110 struct stat entry_stat;
111 while (true) {
112 // Set errno to 0 to be able to check if an error occurs during the
113 // readdir() call. NULL is the return value when the end of the directory
114 // stream is reached or when an error occurs, and the errno check is the
115 // only thing that helps us distinguish between the two cases. See
116 // documentation at
117 // http://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html
118 errno = 0;
119 entry = readdir(dir);
120 if (entry == NULL) {
121 return errno == 0;
122 }
123 if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
124 continue;
125 }
126 const string entry_path = path + "/" + entry->d_name;
127 if (stat(entry_path.c_str(), &entry_stat)) {
128 return false;
129 }
130 DirEntryKinds kind = kFile;
131 if (S_ISDIR(entry_stat.st_mode)) {
132 kind = kDirectory;
133 } else if (!S_ISREG(entry_stat.st_mode)) {
134 continue;
135 }
136 entries->push_back(DirEntry(entry->d_name, kind));
137 }
138 }
139
// Returns true if the last suffix.length() characters of s are equal to
// suffix. An empty suffix matches every string.
bool EndsWith(const std::string& s, const std::string& suffix) {
  if (s.length() < suffix.length()) {
    return false;
  }
  // Compare the suffix against the tail of s, left to right.
  const std::string::const_iterator tail =
      s.end() - static_cast<std::string::difference_type>(suffix.length());
  return std::equal(suffix.begin(), suffix.end(), tail);
}
147
// Parses the integer at the start of s into *n using stream extraction, so
// leading whitespace is skipped and anything following the digits is ignored
// ("12abc" yields 12). Returns true if a number could be extracted.
bool StrToInt(const std::string& s, int32* n) {
  std::stringstream parser(s);
  parser >> *n;
  const bool extracted = !parser.fail();
  return extracted;
}
155
// Formats n as text into *s by writing it to a string stream and reading the
// resulting token back. Returns true if both stream operations succeeded.
bool IntToStr(int32 n, std::string* s) {
  std::stringstream converter;
  converter << n;
  converter >> *s;
  const bool converted = !converter.fail();
  return converted;
}
163
164 // Parses the prefix descriptions file at path, clears and fills the output
165 // prefixes phone number prefix to description mapping.
166 // Returns true on success.
ParsePrefixes(const string & path,absl::btree_map<int32,string> * prefixes)167 bool ParsePrefixes(const string& path,
168 absl::btree_map<int32, string>* prefixes) {
169 prefixes->clear();
170 FILE* input = fopen(path.c_str(), "r");
171 if (!input) {
172 return false;
173 }
174 AutoCloser<FILE> input_closer(&input, fclose);
175 const int kMaxLineLength = 2*1024;
176 vector<char> buffer(kMaxLineLength);
177 vector<char>::iterator begin, end, sep;
178 string prefix, description;
179 int32 prefix_code;
180 while (fgets(&buffer[0], buffer.size(), input)) {
181 begin = buffer.begin();
182 end = std::find(begin, buffer.end(), '\0');
183 if (end == begin) {
184 continue;
185 }
186 --end;
187 if (*end != '\n' && !feof(input)) {
188 // A line without LF can only happen at the end of file.
189 return false;
190 }
191
192 // Trim and check for comments.
193 for (; begin != end && std::isspace(*begin); ++begin) {}
194 for (; end != begin && std::isspace(*(end - 1)); --end) {}
195 if (begin == end || *begin == '#') {
196 continue;
197 }
198
199 sep = std::find(begin, end, '|');
200 if (sep == end) {
201 continue;
202 }
203 prefix = string(begin, sep);
204 if (!StrToInt(prefix, &prefix_code)) {
205 return false;
206 }
207 (*prefixes)[prefix_code] = string(sep + 1, end);
208 }
209 return ferror(input) == 0;
210 }
211
// Builds a C string literal from s. The output is enclosed in double-quotes.
// Printable ASCII characters are copied as is, except that double quotes,
// backslashes and single quotes are preceded by a backslash. Every other byte
// (control characters, DEL, non-ASCII) is written as a two-digit \x escape.
//
// A hex escape is always placed in its own adjacent literal piece, so that a
// following character such as 'a' cannot be read as part of the escape.
//
// An input string:
//   Op\xc3\xa9ra
// becomes:
//   "Op""\xc3""\xa9""ra"
std::string MakeStringLiteral(const std::string& s) {
  // Kind of the piece written last, used to decide when to split the literal.
  enum PieceKind { kNone, kPlain, kHexEscape };

  std::stringstream buffer;
  buffer << std::hex << std::setfill('0');
  buffer << "\"";
  PieceKind previous = kNone;
  for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) {
    const char c = *it;
    // Work on the unsigned value so the result does not depend on whether
    // plain char is signed.
    const unsigned char byte = static_cast<unsigned char>(c);
    if (byte >= 32 && byte < 127) {
      if (previous == kHexEscape) {
        buffer << "\"\"";
      }
      // '"' would end the literal early and '\\' would start an escape
      // sequence; both must be escaped. Escaping '\'' is optional in a string
      // literal but is kept so existing output does not change.
      if (c == '"' || c == '\\' || c == '\'') {
        buffer << "\\";
      }
      buffer << c;
      previous = kPlain;
    } else {
      if (previous != kNone) {
        buffer << "\"\"";
      }
      buffer << "\\x" << std::setw(2) << static_cast<int>(byte);
      previous = kHexEscape;
    }
  }
  buffer << "\"";
  return buffer.str();
}
246
WriteStringLiteral(const string & s,FILE * output)247 void WriteStringLiteral(const string& s, FILE* output) {
248 string literal = MakeStringLiteral(s);
249 fprintf(output, "%s", literal.c_str());
250 }
251
// License banner placed at the top of the generated file. It ends with a
// notice that the file is generated, followed by an empty line.
const char kLicense[] =
    "// Copyright (C) 2012 The Libphonenumber Authors\n"
    "//\n"
    "// Licensed under the Apache License, Version 2.0 (the \"License\");\n"
    "// you may not use this file except in compliance with the License.\n"
    "// You may obtain a copy of the License at\n"
    "//\n"
    "// http://www.apache.org/licenses/LICENSE-2.0\n"
    "//\n"
    "// Unless required by applicable law or agreed to in writing, software\n"
    "// distributed under the License is distributed on an \"AS IS\" BASIS,\n"
    "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or "
    "implied.\n"
    "// See the License for the specific language governing permissions and\n"
    "// limitations under the License.\n"
    "//\n"
    "// This file is generated automatically, do not edit it manually.\n"
    "\n";

// Writes the license banner to output, verbatim.
void WriteLicense(FILE* output) {
  const char* const banner = kLicense;
  fprintf(output, "%s", banner);
}
274
// Names of the two nested namespaces that wrap the generated code.
const char kI18NNS[] = "i18n";
const char kPhoneNumbersNS[] = "phonenumbers";

// Writes the lines opening the outer, then the inner namespace.
void WriteNSHeader(FILE* output) {
  fprintf(output,
          "namespace %s {\n"
          "namespace %s {\n",
          kI18NNS, kPhoneNumbersNS);
}
282
WriteNSFooter(FILE * output)283 void WriteNSFooter(FILE* output) {
284 fprintf(output, "} // namespace %s\n", kPhoneNumbersNS);
285 fprintf(output, "} // namespace %s\n", kI18NNS);
286 }
287
// Writes the #include directives of the generated .cc file: first the
// matching header "phonenumbers/geocoding/<base_name>.h", then basictypes.h,
// each followed by an empty line.
void WriteCppHeader(const std::string& base_name, FILE* output) {
  fprintf(output,
          "#include \"phonenumbers/geocoding/%s.h\"\n"
          "\n"
          "#include \"phonenumbers/base/basictypes.h\"\n"
          "\n",
          base_name.c_str());
}
295
// Writes two initializer fields for the array variable "name": the array
// itself, then its element count as a sizeof expression.
void WriteArrayAndSize(const std::string& name, FILE* output) {
  const char* const array_name = name.c_str();
  fprintf(output,
          " %s,\n"
          " sizeof(%s)/sizeof(*%s),\n",
          array_name, array_name, array_name);
}
300
301 // Writes a PrefixDescriptions variable named "name", with its prefixes field
302 // set to "prefixes_name" variable, its descriptions to "desc_name" and its
303 // possible_lengths to "possible_lengths_name":
304 //
305 // const PrefixDescriptions ${name} = {
306 // ${prefix_name},
307 // sizeof(${prefix_name})/sizeof(*${prefix_name}),
308 // ${desc_name},
309 // ${possible_lengths_name},
310 // sizeof(${possible_lengths_name})/sizeof(*${possible_lengths_name}),
311 // };
312 //
WritePrefixDescriptionsDefinition(const string & name,const string & prefixes_name,const string & desc_name,const string & possible_lengths_name,FILE * output)313 void WritePrefixDescriptionsDefinition(
314 const string& name, const string& prefixes_name, const string& desc_name,
315 const string& possible_lengths_name, FILE* output) {
316 fprintf(output, "const PrefixDescriptions %s = {\n", name.c_str());
317 WriteArrayAndSize(prefixes_name, output);
318 fprintf(output, " %s,\n", desc_name.c_str());
319 WriteArrayAndSize(possible_lengths_name, output);
320 fprintf(output, "};\n");
321 }
322
323 // Writes prefixes, descriptions and possible_lengths arrays built from the
324 // phone number prefix to description mapping "prefixes". Binds these arrays
325 // in a single PrefixDescriptions variable named "var_name".
326 //
327 // const int32 ${var_name}_prefixes[] = {
328 // 1201,
329 // 1650,
330 // };
331 //
332 // const char* ${var_name}_descriptions[] = {
333 // "New Jerse",
334 // "Kalifornie",
335 // };
336 //
337 // const int32 ${var_name}_possible_lengths[] = {
338 // 4,
339 // };
340 //
341 // const PrefixDescriptions ${var_name} = {
342 // ...
343 // };
344 //
WritePrefixDescriptions(const string & var_name,const absl::btree_map<int,string> & prefixes,FILE * output)345 void WritePrefixDescriptions(const string& var_name,
346 const absl::btree_map<int, string>& prefixes,
347 FILE* output) {
348 absl::btree_set<int> possible_lengths;
349 const string prefixes_name = var_name + "_prefixes";
350 fprintf(output, "const int32 %s[] = {\n", prefixes_name.c_str());
351 for (absl::btree_map<int, string>::const_iterator it = prefixes.begin();
352 it != prefixes.end(); ++it) {
353 fprintf(output, " %d,\n", it->first);
354 possible_lengths.insert(static_cast<int>(log10(it->first) + 1));
355 }
356 fprintf(output,
357 "};\n"
358 "\n");
359
360 const string desc_name = var_name + "_descriptions";
361 fprintf(output, "const char* %s[] = {\n", desc_name.c_str());
362 for (absl::btree_map<int, string>::const_iterator it = prefixes.begin();
363 it != prefixes.end(); ++it) {
364 fprintf(output, " ");
365 WriteStringLiteral(it->second, output);
366 fprintf(output, ",\n");
367 }
368 fprintf(output,
369 "};\n"
370 "\n");
371
372 const string possible_lengths_name = var_name + "_possible_lengths";
373 fprintf(output, "const int32 %s[] = {\n ", possible_lengths_name.c_str());
374 for (absl::btree_set<int>::const_iterator it = possible_lengths.begin();
375 it != possible_lengths.end(); ++it) {
376 fprintf(output, " %d,", *it);
377 }
378 fprintf(output,
379 "\n"
380 "};\n"
381 "\n");
382
383 WritePrefixDescriptionsDefinition(var_name, prefixes_name, desc_name,
384 possible_lengths_name, output);
385 fprintf(output, "\n");
386 }
387
388 // Writes a pair of arrays mapping prefix language code pairs to
389 // PrefixDescriptions instances. "prefix_var_names" maps language code pairs
390 // to prefix variable names.
391 //
392 // const char* prefix_language_code_pairs[] = {
393 // "1_de",
394 // "1_en",
395 // };
396 //
397 // const PrefixDescriptions* prefix_descriptions[] = {
398 // &prefix_1_de,
399 // &prefix_1_en,
400 // };
401 //
WritePrefixesDescriptions(const absl::btree_map<string,string> & prefix_var_names,FILE * output)402 void WritePrefixesDescriptions(
403 const absl::btree_map<string, string>& prefix_var_names, FILE* output) {
404 fprintf(output, "const char* prefix_language_code_pairs[] = {\n");
405 for (absl::btree_map<string, string>::const_iterator it = prefix_var_names.begin();
406 it != prefix_var_names.end(); ++it) {
407 fprintf(output, " \"%s\",\n", it->first.c_str());
408 }
409 fprintf(output,
410 "};\n"
411 "\n"
412 "const PrefixDescriptions* prefixes_descriptions[] = {\n");
413 for (absl::btree_map<string, string>::const_iterator it = prefix_var_names.begin();
414 it != prefix_var_names.end(); ++it) {
415 fprintf(output, " &%s,\n", it->second.c_str());
416 }
417 fprintf(output,
418 "};\n"
419 "\n");
420 }
421
422 // For each entry in "languages" mapping a country calling code to a set
423 // of available languages, writes a sorted array of languages, then wraps it
424 // into a CountryLanguages instance. Finally, writes a pair of arrays mapping
425 // country calling codes to CountryLanguages instances.
426 //
427 // const char* country_1[] = {
428 // "de",
429 // "en",
430 // };
431 //
432 // const CountryLanguages country_1_languages = {
433 // country_1,
434 // sizeof(country_1)/sizeof(*country_1),
435 // };
436 //
437 // [...]
438 //
439 // const CountryLanguages* country_languages[] = {
440 // &country_1_languages,
441 // [...]
442 // }
443 //
444 // const int country_calling_codes[] = {
445 // 1,
446 // [...]
447 // };
448 //
WriteCountryLanguages(const map<int32,set<string>> & languages,FILE * output)449 bool WriteCountryLanguages(const map<int32, set<string> >& languages,
450 FILE* output) {
451 vector<string> country_languages_vars;
452 vector<string> countries;
453 for (map<int32, set<string> >::const_iterator it = languages.begin();
454 it != languages.end(); ++it) {
455 string country_code;
456 if (!IntToStr(it->first, &country_code)) {
457 return false;
458 }
459 const string country_var = "country_" + country_code;
460 fprintf(output, "const char* %s[] = {\n", country_var.c_str());
461 for (set<string>::const_iterator it_lang = it->second.begin();
462 it_lang != it->second.end(); ++it_lang) {
463 fprintf(output, " \"%s\",\n", it_lang->c_str());
464 }
465 fprintf(output,
466 "};\n"
467 "\n");
468
469 const string country_languages_var = country_var + "_languages";
470 fprintf(output, "const CountryLanguages %s = {\n",
471 country_languages_var.c_str());
472 WriteArrayAndSize(country_var, output);
473 fprintf(output,
474 "};\n"
475 "\n");
476 country_languages_vars.push_back(country_languages_var);
477 countries.push_back(country_code);
478 }
479
480 fprintf(output,
481 "\n"
482 "const CountryLanguages* countries_languages[] = {\n");
483 for (vector<string>::const_iterator
484 it_languages_var = country_languages_vars.begin();
485 it_languages_var != country_languages_vars.end(); ++it_languages_var) {
486 fprintf(output, " &%s,\n", it_languages_var->c_str());
487 }
488 fprintf(output,
489 "};\n"
490 "\n"
491 "const int country_calling_codes[] = {\n");
492 for (vector<string>::const_iterator it_country = countries.begin();
493 it_country != countries.end(); ++it_country) {
494 fprintf(output, " %s,\n", it_country->c_str());
495 }
496 fprintf(output,
497 "};\n"
498 "\n");
499 return true;
500 }
501
// Returns a copy of input in which every occurrence of pattern is replaced
// with value. Occurrences are searched left to right and do not overlap; the
// inserted value is never searched again. If pattern is empty, input is
// returned unchanged.
std::string ReplaceAll(const std::string& input, const std::string& pattern,
                       const std::string& value) {
  if (pattern.empty()) {
    return input;
  }
  std::string replaced;
  std::string::size_type cursor = 0;  // Start of the text not copied yet.
  for (;;) {
    const std::string::size_type hit = input.find(pattern, cursor);
    if (hit == std::string::npos) {
      // No more occurrences: copy the remainder and stop.
      replaced.append(input, cursor, std::string::npos);
      return replaced;
    }
    replaced.append(input, cursor, hit - cursor);
    replaced.append(value);
    cursor = hit + pattern.length();
  }
}
525
526 // Writes data accessor definitions, prefixed with "accessor_prefix".
WriteAccessorsDefinitions(const string & accessor_prefix,FILE * output)527 void WriteAccessorsDefinitions(const string& accessor_prefix, FILE* output) {
528 string templ =
529 "const int* get$prefix$_country_calling_codes() {\n"
530 " return country_calling_codes;\n"
531 "}\n"
532 "\n"
533 "int get$prefix$_country_calling_codes_size() {\n"
534 " return sizeof(country_calling_codes)\n"
535 " /sizeof(*country_calling_codes);\n"
536 "}\n"
537 "\n"
538 "const CountryLanguages* get$prefix$_country_languages(int index) {\n"
539 " return countries_languages[index];\n"
540 "}\n"
541 "\n"
542 "const char** get$prefix$_prefix_language_code_pairs() {\n"
543 " return prefix_language_code_pairs;\n"
544 "}\n"
545 "\n"
546 "int get$prefix$_prefix_language_code_pairs_size() {\n"
547 " return sizeof(prefix_language_code_pairs)\n"
548 " /sizeof(*prefix_language_code_pairs);\n"
549 "}\n"
550 "\n"
551 "const PrefixDescriptions* get$prefix$_prefix_descriptions(int index) {\n"
552 " return prefixes_descriptions[index];\n"
553 "}\n";
554 string defs = ReplaceAll(templ, "$prefix$", accessor_prefix);
555 fprintf(output, "%s", defs.c_str());
556 }
557
558 // Writes geocoding data .cc file. "data_path" is the path of geocoding textual
559 // data directory. "base_name" is the base name of the .h/.cc pair, like
560 // "geocoding_data".
WriteSource(const string & data_path,const string & base_name,const string & accessor_prefix,FILE * output)561 bool WriteSource(const string& data_path, const string& base_name,
562 const string& accessor_prefix, FILE* output) {
563 WriteLicense(output);
564 WriteCppHeader(base_name, output);
565 WriteNSHeader(output);
566 fprintf(output,
567 "namespace {\n"
568 "\n");
569
570 // Enumerate language/script directories.
571 absl::btree_map<string, string> prefix_vars;
572 map<int32, set<string> > country_languages;
573 vector<DirEntry> entries;
574 if (!ListDirectory(data_path, &entries)) {
575 fprintf(stderr, "failed to read directory entries");
576 return false;
577 }
578 for (vector<DirEntry>::const_iterator it = entries.begin();
579 it != entries.end(); ++it) {
580 if (it->kind() != kDirectory) {
581 continue;
582 }
583 // Enumerate country calling code files.
584 const string dir_path = data_path + "/" + it->name();
585 vector<DirEntry> files;
586 if (!ListDirectory(dir_path, &files)) {
587 fprintf(stderr, "failed to read file entries\n");
588 return false;
589 }
590 for (vector<DirEntry>::const_iterator it_files = files.begin();
591 it_files != files.end(); ++it_files) {
592 const string fname = it_files->name();
593 if (!EndsWith(fname, ".txt")) {
594 continue;
595 }
596 int32 country_code;
597 const string country_code_str = fname.substr(0, fname.length() - 4);
598 if (!StrToInt(country_code_str, &country_code)) {
599 return false;
600 }
601 const string path = dir_path + "/" + fname;
602
603 absl::btree_map<int32, string> prefixes;
604 if (!ParsePrefixes(path, &prefixes)) {
605 return false;
606 }
607
608 const string prefix_var = "prefix_" + country_code_str + "_" + it->name();
609 WritePrefixDescriptions(prefix_var, prefixes, output);
610 prefix_vars[country_code_str + "_" + it->name()] = prefix_var;
611 country_languages[country_code].insert(it->name());
612 }
613 }
614 WritePrefixesDescriptions(prefix_vars, output);
615 if (!WriteCountryLanguages(country_languages, output)) {
616 return false;
617 }
618 fprintf(output, "} // namespace\n");
619 fprintf(output, "\n");
620 WriteAccessorsDefinitions(accessor_prefix, output);
621 WriteNSFooter(output);
622 return ferror(output) == 0;
623 }
624
// Prints "message" as an error, followed by the command line usage, to stderr.
// Returns 1 so that callers can use the result as the process exit code.
int PrintHelp(const std::string& message) {
  fprintf(stderr, "error: %s\n", message.c_str());
  // The third argument is optional: Main() uses it as the accessor prefix.
  fprintf(stderr,
          "generate_geocoding_data DATADIR CCPATH [ACCESSOR_PREFIX]\n");
  return 1;
}
630
Main(int argc,const char * argv[])631 int Main(int argc, const char* argv[]) {
632 if (argc < 2) {
633 return PrintHelp("geocoding data root directory expected");
634 }
635 if (argc < 3) {
636 return PrintHelp("output source path expected");
637 }
638 string accessor_prefix = "";
639 if (argc > 3) {
640 accessor_prefix = argv[3];
641 }
642 const string root_path(argv[1]);
643 string source_path(argv[2]);
644 std::replace(source_path.begin(), source_path.end(), '\\', '/');
645 string base_name = source_path;
646 if (base_name.rfind('/') != string::npos) {
647 base_name = base_name.substr(base_name.rfind('/') + 1);
648 }
649 base_name = base_name.substr(0, base_name.rfind('.'));
650
651 FILE* source_fp = fopen(source_path.c_str(), "w");
652 if (!source_fp) {
653 fprintf(stderr, "failed to open %s\n", source_path.c_str());
654 return 1;
655 }
656 AutoCloser<FILE> source_closer(&source_fp, fclose);
657 if (!WriteSource(root_path, base_name, accessor_prefix,
658 source_fp)) {
659 return 1;
660 }
661 return 0;
662 }
663
664 } // namespace phonenumbers
665 } // namespace i18n
666