1 // Copyright (C) 2012 The Libphonenumber Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: Patrick Mezard
16
17 #include "cpp-build/generate_geocoding_data.h"
18
19 #include <dirent.h>
20 #include <errno.h>
21 #include <locale>
22 #include <sys/stat.h>
23 #include <algorithm>
24 #include <cctype>
25 #include <cmath>
26 #include <cstdio>
27 #include <cstring>
28 #include <iomanip>
29 #include <iterator>
30 #include <map>
31 #include <set>
32 #include <sstream>
33 #include <string>
34 #include <utility>
35 #include <vector>
36
37 #include "base/basictypes.h"
38
39 namespace i18n {
40 namespace phonenumbers {
41
42 using std::map;
43 using std::string;
44 using std::vector;
45 using std::set;
46 using std::pair;
47
// Scoped guard for a resource held through a pointer variable owned by the
// caller. When the guard goes out of scope, or when Close() is called
// explicitly, the resource is passed to the release function and the caller's
// pointer is reset to NULL, which turns any later Close() into a no-op.
template <typename ResourceType>
class AutoCloser {
 public:
  // Function used to dispose of the resource; its result is ignored.
  typedef int (*ReleaseFunction)(ResourceType* resource);

  // "resource" is the address of the caller's pointer variable; it must
  // outlive the guard.
  AutoCloser(ResourceType** resource, ReleaseFunction release_function)
      : handle_(resource), release_(release_function) {}

  ~AutoCloser() { Close(); }

  // Returns the guarded resource, or NULL once it has been released.
  ResourceType* get_resource() const { return *handle_; }

  // Releases the resource if it is still held.
  void Close() {
    if (!*handle_) {
      return;
    }
    release_(*handle_);
    *handle_ = NULL;
  }

 private:
  ResourceType** handle_;
  ReleaseFunction release_;
};
76
// Kinds of directory entries distinguished by the directory listing code.
enum DirEntryKinds {
  kFile = 0,
  kDirectory = 1,
};

// Read-only (name, kind) record describing a single directory entry.
class DirEntry {
 public:
  // "n" is copied into the entry; "k" tells whether it names a file or a
  // directory.
  DirEntry(const char* n, DirEntryKinds k) : kind_(k), name_(n) {}

  DirEntryKinds kind() const { return kind_; }
  const std::string& name() const { return name_; }

 private:
  DirEntryKinds kind_;
  std::string name_;
};
96
97 // Lists directory entries in path. "." and ".." are excluded. Returns true on
98 // success.
ListDirectory(const string & path,vector<DirEntry> * entries)99 bool ListDirectory(const string& path, vector<DirEntry>* entries) {
100 entries->clear();
101 DIR* dir = opendir(path.c_str());
102 if (!dir) {
103 return false;
104 }
105 AutoCloser<DIR> dir_closer(&dir, closedir);
106 struct dirent *entry;
107 struct stat entry_stat;
108 while (true) {
109 // Set errno to 0 to be able to check if an error occurs during the
110 // readdir() call. NULL is the return value when the end of the directory
111 // stream is reached or when an error occurs, and the errno check is the
112 // only thing that helps us distinguish between the two cases. See
113 // documentation at
114 // http://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html
115 errno = 0;
116 entry = readdir(dir);
117 if (entry == NULL) {
118 return errno == 0;
119 }
120 if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
121 continue;
122 }
123 const string entry_path = path + "/" + entry->d_name;
124 if (stat(entry_path.c_str(), &entry_stat)) {
125 return false;
126 }
127 DirEntryKinds kind = kFile;
128 if (S_ISDIR(entry_stat.st_mode)) {
129 kind = kDirectory;
130 } else if (!S_ISREG(entry_stat.st_mode)) {
131 continue;
132 }
133 entries->push_back(DirEntry(entry->d_name, kind));
134 }
135 }
136
// Tells whether "suffix" is a trailing substring of "s". The empty suffix
// matches every string.
bool EndsWith(const string& s, const string& suffix) {
  const string::size_type suffix_length = suffix.length();
  if (s.length() < suffix_length) {
    return false;
  }
  return s.compare(s.length() - suffix_length, suffix_length, suffix) == 0;
}
144
// Converts the decimal number in "s" to an integer stored in "*n". Leading
// and trailing whitespace is tolerated, but the conversion fails if "s" is
// empty, is not a number, does not fit in an int32 or is followed by anything
// other than whitespace (e.g. "12abc"). Returns true on success; "*n" is only
// meaningful in that case.
bool StrToInt(const string& s, int32* n) {
  std::istringstream stream(s);
  stream >> *n;
  if (stream.fail()) {
    return false;
  }
  if (stream.eof()) {
    // The number extends to the end of the input.
    return true;
  }
  // operator>> stops at the first character which cannot be part of the
  // number; make sure only whitespace is left so that trailing garbage is
  // rejected instead of being silently dropped.
  stream >> std::ws;
  return stream.eof();
}
152
// Stores the decimal representation of "n" in "*s". Returns true on success;
// "*s" is left untouched otherwise.
bool IntToStr(int32 n, string* s) {
  std::ostringstream formatter;
  formatter << n;
  if (formatter.fail()) {
    return false;
  }
  *s = formatter.str();
  return true;
}
160
161 // Parses the prefix descriptions file at path, clears and fills the output
162 // prefixes phone number prefix to description mapping.
163 // Returns true on success.
ParsePrefixes(const string & path,map<int32,string> * prefixes)164 bool ParsePrefixes(const string& path, map<int32, string>* prefixes) {
165 prefixes->clear();
166 FILE* input = fopen(path.c_str(), "r");
167 if (!input) {
168 return false;
169 }
170 AutoCloser<FILE> input_closer(&input, fclose);
171 const int kMaxLineLength = 2*1024;
172 vector<char> buffer(kMaxLineLength);
173 vector<char>::iterator begin, end, sep;
174 string prefix, description;
175 int32 prefix_code;
176 while (fgets(&buffer[0], buffer.size(), input)) {
177 begin = buffer.begin();
178 end = std::find(begin, buffer.end(), '\0');
179 if (end == begin) {
180 continue;
181 }
182 --end;
183 if (*end != '\n' && !feof(input)) {
184 // A line without LF can only happen at the end of file.
185 return false;
186 }
187
188 // Trim and check for comments.
189 for (; begin != end && std::isspace(*begin); ++begin) {}
190 for (; end != begin && std::isspace(*(end - 1)); --end) {}
191 if (begin == end || *begin == '#') {
192 continue;
193 }
194
195 sep = std::find(begin, end, '|');
196 if (sep == end) {
197 continue;
198 }
199 prefix = string(begin, sep);
200 if (!StrToInt(prefix, &prefix_code)) {
201 return false;
202 }
203 (*prefixes)[prefix_code] = string(sep + 1, end);
204 }
205 return ferror(input) == 0;
206 }
207
// Builds a C string literal from s. The output is enclosed in double-quotes.
// Quotes (single and double) and backslashes are backslash-escaped, and
// non-ASCII or control characters are written as hexadecimal escapes. Each
// hexadecimal escape is isolated in its own adjacent literal so that it can
// never absorb a following character which happens to be a hex digit.
//
// An input string:
//   Op\xc3\xa9ra
// becomes:
//   "Op""\xc3""\xa9""ra"
string MakeStringLiteral(const string& s) {
  // What was written last; decides where the literal has to be split.
  enum LastOutput { kNothing, kPlainChar, kHexEscape };
  LastOutput previous = kNothing;

  std::stringstream buffer;
  buffer << std::hex << std::setfill('0');
  buffer << "\"";
  for (string::const_iterator it = s.begin(); it != s.end(); ++it) {
    // Work on the unsigned value so that bytes >= 0x80 behave the same
    // whether plain char is signed or not.
    const unsigned char c = static_cast<unsigned char>(*it);
    if (c >= 32 && c < 127) {
      if (previous == kHexEscape) {
        buffer << "\"\"";
      }
      if (c == '\'' || c == '"' || c == '\\') {
        // Unescaped, '"' would terminate the literal and '\\' would start an
        // escape sequence in the generated code.
        buffer << "\\";
      }
      buffer << static_cast<char>(c);
      previous = kPlainChar;
    } else {
      if (previous != kNothing) {
        buffer << "\"\"";
      }
      buffer << "\\x" << std::setw(2) << static_cast<int>(c);
      previous = kHexEscape;
    }
  }
  buffer << "\"";
  return buffer.str();
}
242
WriteStringLiteral(const string & s,FILE * output)243 void WriteStringLiteral(const string& s, FILE* output) {
244 string literal = MakeStringLiteral(s);
245 fprintf(output, "%s", literal.c_str());
246 }
247
// Comment header placed at the top of the generated source: the Apache 2.0
// license notice followed by a "generated file" warning and a blank line.
const char kLicense[] =
    "// Copyright (C) 2012 The Libphonenumber Authors\n"
    "//\n"
    "// Licensed under the Apache License, Version 2.0 (the \"License\");\n"
    "// you may not use this file except in compliance with the License.\n"
    "// You may obtain a copy of the License at\n"
    "//\n"
    "// http://www.apache.org/licenses/LICENSE-2.0\n"
    "//\n"
    "// Unless required by applicable law or agreed to in writing, software\n"
    "// distributed under the License is distributed on an \"AS IS\" BASIS,\n"
    "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, "
    "either express or implied.\n"
    "// See the License for the specific language governing permissions and\n"
    "// limitations under the License.\n"
    "//\n"
    "// This file is generated automatically, do not edit it manually.\n"
    "\n";

// Writes the license header to "output".
void WriteLicense(FILE* output) {
  fputs(kLicense, output);
}
270
// Names of the two nested namespaces wrapping the generated code, outermost
// first.
const char kI18NNS[] = "i18n";
const char kPhoneNumbersNS[] = "phonenumbers";

// Writes the lines opening both namespaces to "output".
void WriteNSHeader(FILE* output) {
  fprintf(output,
          "namespace %s {\n"
          "namespace %s {\n",
          kI18NNS, kPhoneNumbersNS);
}

// Writes the lines closing both namespaces, innermost first, to "output".
void WriteNSFooter(FILE* output) {
  fprintf(output,
          "} // namespace %s\n"
          "} // namespace %s\n",
          kPhoneNumbersNS, kI18NNS);
}
283
// Writes the #include section of the generated source to "output": the
// geocoding header named after "base_name", then basictypes.h, each followed
// by a blank line.
void WriteCppHeader(const string& base_name, FILE* output) {
  fprintf(output,
          "#include \"phonenumbers/geocoding/%s.h\"\n"
          "\n"
          "#include \"phonenumbers/base/basictypes.h\"\n"
          "\n",
          base_name.c_str());
}
291
// Writes two aggregate initializer items to "output": the array variable
// "name" itself, then an expression evaluating to its number of elements.
void WriteArrayAndSize(const string& name, FILE* output) {
  const char* const array_name = name.c_str();
  fprintf(output,
          " %s,\n"
          " sizeof(%s)/sizeof(*%s),\n",
          array_name, array_name, array_name);
}
296
297 // Writes a PrefixDescriptions variable named "name", with its prefixes field
298 // set to "prefixes_name" variable, its descriptions to "desc_name" and its
299 // possible_lengths to "possible_lengths_name":
300 //
301 // const PrefixDescriptions ${name} = {
302 // ${prefix_name},
303 // sizeof(${prefix_name})/sizeof(*${prefix_name}),
304 // ${desc_name},
305 // ${possible_lengths_name},
306 // sizeof(${possible_lengths_name})/sizeof(*${possible_lengths_name}),
307 // };
308 //
WritePrefixDescriptionsDefinition(const string & name,const string & prefixes_name,const string & desc_name,const string & possible_lengths_name,FILE * output)309 void WritePrefixDescriptionsDefinition(
310 const string& name, const string& prefixes_name, const string& desc_name,
311 const string& possible_lengths_name, FILE* output) {
312 fprintf(output, "const PrefixDescriptions %s = {\n", name.c_str());
313 WriteArrayAndSize(prefixes_name, output);
314 fprintf(output, " %s,\n", desc_name.c_str());
315 WriteArrayAndSize(possible_lengths_name, output);
316 fprintf(output, "};\n");
317 }
318
319 // Writes prefixes, descriptions and possible_lengths arrays built from the
320 // phone number prefix to description mapping "prefixes". Binds these arrays
321 // in a single PrefixDescriptions variable named "var_name".
322 //
323 // const int32 ${var_name}_prefixes[] = {
324 // 1201,
325 // 1650,
326 // };
327 //
328 // const char* ${var_name}_descriptions[] = {
329 // "New Jerse",
330 // "Kalifornie",
331 // };
332 //
333 // const int32 ${var_name}_possible_lengths[] = {
334 // 4,
335 // };
336 //
337 // const PrefixDescriptions ${var_name} = {
338 // ...
339 // };
340 //
WritePrefixDescriptions(const string & var_name,const map<int,string> & prefixes,FILE * output)341 void WritePrefixDescriptions(const string& var_name, const map<int, string>&
342 prefixes, FILE* output) {
343 set<int> possible_lengths;
344 const string prefixes_name = var_name + "_prefixes";
345 fprintf(output, "const int32 %s[] = {\n", prefixes_name.c_str());
346 for (map<int, string>::const_iterator it = prefixes.begin();
347 it != prefixes.end(); ++it) {
348 fprintf(output, " %d,\n", it->first);
349 possible_lengths.insert(static_cast<int>(log10(it->first) + 1));
350 }
351 fprintf(output,
352 "};\n"
353 "\n");
354
355 const string desc_name = var_name + "_descriptions";
356 fprintf(output, "const char* %s[] = {\n", desc_name.c_str());
357 for (map<int, string>::const_iterator it = prefixes.begin();
358 it != prefixes.end(); ++it) {
359 fprintf(output, " ");
360 WriteStringLiteral(it->second, output);
361 fprintf(output, ",\n");
362 }
363 fprintf(output,
364 "};\n"
365 "\n");
366
367 const string possible_lengths_name = var_name + "_possible_lengths";
368 fprintf(output, "const int32 %s[] = {\n ", possible_lengths_name.c_str());
369 for (set<int>::const_iterator it = possible_lengths.begin();
370 it != possible_lengths.end(); ++it) {
371 fprintf(output, " %d,", *it);
372 }
373 fprintf(output,
374 "\n"
375 "};\n"
376 "\n");
377
378 WritePrefixDescriptionsDefinition(var_name, prefixes_name, desc_name,
379 possible_lengths_name, output);
380 fprintf(output, "\n");
381 }
382
// Writes two parallel arrays to "output": the keys of "prefix_var_names"
// (prefix language code pairs such as "1_de") and the addresses of the
// PrefixDescriptions variables named by the matching values. Both arrays
// follow the key order of the map.
//
//   const char* prefix_language_code_pairs[] = {
//     "1_de",
//     "1_en",
//   };
//
//   const PrefixDescriptions* prefixes_descriptions[] = {
//     &prefix_1_de,
//     &prefix_1_en,
//   };
//
void WritePrefixesDescriptions(const map<string, string>& prefix_var_names,
                               FILE* output) {
  typedef map<string, string>::const_iterator PairIterator;
  const PairIterator last = prefix_var_names.end();

  fputs("const char* prefix_language_code_pairs[] = {\n", output);
  for (PairIterator entry = prefix_var_names.begin(); entry != last;
       ++entry) {
    fprintf(output, " \"%s\",\n", entry->first.c_str());
  }
  fputs("};\n"
        "\n"
        "const PrefixDescriptions* prefixes_descriptions[] = {\n",
        output);
  for (PairIterator entry = prefix_var_names.begin(); entry != last;
       ++entry) {
    fprintf(output, " &%s,\n", entry->second.c_str());
  }
  fputs("};\n"
        "\n",
        output);
}
416
417 // For each entry in "languages" mapping a country calling code to a set
418 // of available languages, writes a sorted array of languages, then wraps it
419 // into a CountryLanguages instance. Finally, writes a pair of arrays mapping
420 // country calling codes to CountryLanguages instances.
421 //
422 // const char* country_1[] = {
423 // "de",
424 // "en",
425 // };
426 //
427 // const CountryLanguages country_1_languages = {
428 // country_1,
429 // sizeof(country_1)/sizeof(*country_1),
430 // };
431 //
432 // [...]
433 //
434 // const CountryLanguages* country_languages[] = {
435 // &country_1_languages,
436 // [...]
437 // }
438 //
439 // const int country_calling_codes[] = {
440 // 1,
441 // [...]
442 // };
443 //
WriteCountryLanguages(const map<int32,set<string>> & languages,FILE * output)444 bool WriteCountryLanguages(const map<int32, set<string> >& languages,
445 FILE* output) {
446 vector<string> country_languages_vars;
447 vector<string> countries;
448 for (map<int32, set<string> >::const_iterator it = languages.begin();
449 it != languages.end(); ++it) {
450 string country_code;
451 if (!IntToStr(it->first, &country_code)) {
452 return false;
453 }
454 const string country_var = "country_" + country_code;
455 fprintf(output, "const char* %s[] = {\n", country_var.c_str());
456 for (set<string>::const_iterator it_lang = it->second.begin();
457 it_lang != it->second.end(); ++it_lang) {
458 fprintf(output, " \"%s\",\n", it_lang->c_str());
459 }
460 fprintf(output,
461 "};\n"
462 "\n");
463
464 const string country_languages_var = country_var + "_languages";
465 fprintf(output, "const CountryLanguages %s = {\n",
466 country_languages_var.c_str());
467 WriteArrayAndSize(country_var, output);
468 fprintf(output,
469 "};\n"
470 "\n");
471 country_languages_vars.push_back(country_languages_var);
472 countries.push_back(country_code);
473 }
474
475 fprintf(output,
476 "\n"
477 "const CountryLanguages* countries_languages[] = {\n");
478 for (vector<string>::const_iterator
479 it_languages_var = country_languages_vars.begin();
480 it_languages_var != country_languages_vars.end(); ++it_languages_var) {
481 fprintf(output, " &%s,\n", it_languages_var->c_str());
482 }
483 fprintf(output,
484 "};\n"
485 "\n"
486 "const int country_calling_codes[] = {\n");
487 for (vector<string>::const_iterator it_country = countries.begin();
488 it_country != countries.end(); ++it_country) {
489 fprintf(output, " %s,\n", it_country->c_str());
490 }
491 fprintf(output,
492 "};\n"
493 "\n");
494 return true;
495 }
496
// Returns a copy of input where all occurrences of pattern are replaced with
// value. Occurrences are searched from left to right and do not overlap; the
// inserted values are not searched again. If pattern is empty, input is
// returned unchanged.
string ReplaceAll(const string& input, const string& pattern,
                  const string& value) {
  if (pattern.empty()) {
    return input;
  }
  string result;
  // Offset of the first input character not copied to the result yet.
  string::size_type cursor = 0;
  for (string::size_type hit = input.find(pattern, cursor);
       hit != string::npos; hit = input.find(pattern, cursor)) {
    result.append(input, cursor, hit - cursor);
    result.append(value);
    cursor = hit + pattern.length();
  }
  result.append(input, cursor, string::npos);
  return result;
}
520
521 // Writes data accessor definitions, prefixed with "accessor_prefix".
WriteAccessorsDefinitions(const string & accessor_prefix,FILE * output)522 void WriteAccessorsDefinitions(const string& accessor_prefix, FILE* output) {
523 string templ =
524 "const int* get$prefix$_country_calling_codes() {\n"
525 " return country_calling_codes;\n"
526 "}\n"
527 "\n"
528 "int get$prefix$_country_calling_codes_size() {\n"
529 " return sizeof(country_calling_codes)\n"
530 " /sizeof(*country_calling_codes);\n"
531 "}\n"
532 "\n"
533 "const CountryLanguages* get$prefix$_country_languages(int index) {\n"
534 " return countries_languages[index];\n"
535 "}\n"
536 "\n"
537 "const char** get$prefix$_prefix_language_code_pairs() {\n"
538 " return prefix_language_code_pairs;\n"
539 "}\n"
540 "\n"
541 "int get$prefix$_prefix_language_code_pairs_size() {\n"
542 " return sizeof(prefix_language_code_pairs)\n"
543 " /sizeof(*prefix_language_code_pairs);\n"
544 "}\n"
545 "\n"
546 "const PrefixDescriptions* get$prefix$_prefix_descriptions(int index) {\n"
547 " return prefixes_descriptions[index];\n"
548 "}\n";
549 string defs = ReplaceAll(templ, "$prefix$", accessor_prefix);
550 fprintf(output, "%s", defs.c_str());
551 }
552
553 // Writes geocoding data .cc file. "data_path" is the path of geocoding textual
554 // data directory. "base_name" is the base name of the .h/.cc pair, like
555 // "geocoding_data".
WriteSource(const string & data_path,const string & base_name,const string & accessor_prefix,FILE * output)556 bool WriteSource(const string& data_path, const string& base_name,
557 const string& accessor_prefix, FILE* output) {
558 WriteLicense(output);
559 WriteCppHeader(base_name, output);
560 WriteNSHeader(output);
561 fprintf(output,
562 "namespace {\n"
563 "\n");
564
565 // Enumerate language/script directories.
566 map<string, string> prefix_vars;
567 map<int32, set<string> > country_languages;
568 vector<DirEntry> entries;
569 if (!ListDirectory(data_path, &entries)) {
570 fprintf(stderr, "failed to read directory entries");
571 return false;
572 }
573 for (vector<DirEntry>::const_iterator it = entries.begin();
574 it != entries.end(); ++it) {
575 if (it->kind() != kDirectory) {
576 continue;
577 }
578 // Enumerate country calling code files.
579 const string dir_path = data_path + "/" + it->name();
580 vector<DirEntry> files;
581 if (!ListDirectory(dir_path, &files)) {
582 fprintf(stderr, "failed to read file entries\n");
583 return false;
584 }
585 for (vector<DirEntry>::const_iterator it_files = files.begin();
586 it_files != files.end(); ++it_files) {
587 const string fname = it_files->name();
588 if (!EndsWith(fname, ".txt")) {
589 continue;
590 }
591 int32 country_code;
592 const string country_code_str = fname.substr(0, fname.length() - 4);
593 if (!StrToInt(country_code_str, &country_code)) {
594 return false;
595 }
596 const string path = dir_path + "/" + fname;
597
598 map<int32, string> prefixes;
599 if (!ParsePrefixes(path, &prefixes)) {
600 return false;
601 }
602
603 const string prefix_var = "prefix_" + country_code_str + "_" + it->name();
604 WritePrefixDescriptions(prefix_var, prefixes, output);
605 prefix_vars[country_code_str + "_" + it->name()] = prefix_var;
606 country_languages[country_code].insert(it->name());
607 }
608 }
609 WritePrefixesDescriptions(prefix_vars, output);
610 if (!WriteCountryLanguages(country_languages, output)) {
611 return false;
612 }
613 fprintf(output, "} // namespace\n");
614 fprintf(output, "\n");
615 WriteAccessorsDefinitions(accessor_prefix, output);
616 WriteNSFooter(output);
617 return ferror(output) == 0;
618 }
619
// Prints the error "message" followed by the command line synopsis to stderr.
// Returns 1 so that callers can use the result as the process exit code.
int PrintHelp(const string& message) {
  fprintf(stderr, "error: %s\n", message.c_str());
  fprintf(stderr,
          "generate_geocoding_data DATADIR CCPATH [ACCESSOR_PREFIX]\n");
  return 1;
}
625
Main(int argc,const char * argv[])626 int Main(int argc, const char* argv[]) {
627 if (argc < 2) {
628 return PrintHelp("geocoding data root directory expected");
629 }
630 if (argc < 3) {
631 return PrintHelp("output source path expected");
632 }
633 string accessor_prefix = "";
634 if (argc > 3) {
635 accessor_prefix = argv[3];
636 }
637 const string root_path(argv[1]);
638 string source_path(argv[2]);
639 std::replace(source_path.begin(), source_path.end(), '\\', '/');
640 string base_name = source_path;
641 if (base_name.rfind('/') != string::npos) {
642 base_name = base_name.substr(base_name.rfind('/') + 1);
643 }
644 base_name = base_name.substr(0, base_name.rfind('.'));
645
646 FILE* source_fp = fopen(source_path.c_str(), "w");
647 if (!source_fp) {
648 fprintf(stderr, "failed to open %s\n", source_path.c_str());
649 return 1;
650 }
651 AutoCloser<FILE> source_closer(&source_fp, fclose);
652 if (!WriteSource(root_path, base_name, accessor_prefix,
653 source_fp)) {
654 return 1;
655 }
656 return 0;
657 }
658
659 } // namespace phonenumbers
660 } // namespace i18n
661