1 // Copyright (C) 2014 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include <libaddressinput/address_formatter.h>
16
17 #include <libaddressinput/address_data.h>
18 #include <libaddressinput/address_field.h>
19 #include <libaddressinput/util/basictypes.h>
20
21 #include <algorithm>
22 #include <cassert>
23 #include <cstddef>
24 #include <functional>
25 #include <string>
26 #include <vector>
27
28 #include "format_element.h"
29 #include "language.h"
30 #include "region_data_constants.h"
31 #include "rule.h"
32 #include "util/cctype_tolower_equal.h"
33
34 namespace i18n {
35 namespace addressinput {
36
37 namespace {
38
// Separators placed between address lines when they are joined into a single
// line. GetLineSeparatorForLanguage() selects one of these (or an empty
// string) based on the language tag.

// Returned for explicitly Latin-script language tags and as the default.
const char kCommaSeparator[] = ", ";
// Returned for the languages listed in kLanguagesThatUseSpace.
const char kSpaceSeparator[] = " ";
// The bytes 0xD8 0x8C are the UTF-8 encoding of U+060C (ARABIC COMMA); the
// separator is that character followed by a space. Returned for the languages
// listed in kLanguagesThatUseAnArabicComma.
const char kArabicCommaSeparator[] = "\xD8\x8C" " "; /* "، " */
42
// Base language codes for which GetLineSeparatorForLanguage() returns
// kSpaceSeparator. Both the characters and the pointers are const so that the
// table cannot be modified at run time.
const char* const kLanguagesThatUseSpace[] = {
  "th",
  "ko"
};
47
// Base language codes for which GetLineSeparatorForLanguage() returns an empty
// separator, i.e. address lines are concatenated directly. Both the characters
// and the pointers are const so that the table cannot be modified at run time.
const char* const kLanguagesThatHaveNoSeparator[] = {
  "ja",
  "zh"  // All Chinese variants.
};
52
// Base language codes for which GetLineSeparatorForLanguage() returns
// kArabicCommaSeparator.
//
// This data is based on CLDR, for languages that are in official use in some
// country, where Arabic is the most likely script tag.
// TODO: Consider supporting variants such as tr-Arab by detecting the script
// code.
//
// Both the characters and the pointers are const so that the table cannot be
// modified at run time.
const char* const kLanguagesThatUseAnArabicComma[] = {
  "ar",
  "az",
  "fa",
  "kk",
  "ku",
  "ky",
  "ps",
  "tg",
  "tk",
  "ur",
  "uz"
};
70
GetLineSeparatorForLanguage(const std::string & language_tag)71 std::string GetLineSeparatorForLanguage(const std::string& language_tag) {
72 Language address_language(language_tag);
73
74 // First deal with explicit script tags.
75 if (address_language.has_latin_script) {
76 return kCommaSeparator;
77 }
78
79 // Now guess something appropriate based on the base language.
80 const std::string& base_language = address_language.base;
81 if (std::find_if(kLanguagesThatUseSpace,
82 kLanguagesThatUseSpace + arraysize(kLanguagesThatUseSpace),
83 std::bind2nd(EqualToTolowerString(), base_language)) !=
84 kLanguagesThatUseSpace + arraysize(kLanguagesThatUseSpace)) {
85 return kSpaceSeparator;
86 } else if (std::find_if(
87 kLanguagesThatHaveNoSeparator,
88 kLanguagesThatHaveNoSeparator +
89 arraysize(kLanguagesThatHaveNoSeparator),
90 std::bind2nd(EqualToTolowerString(), base_language)) !=
91 kLanguagesThatHaveNoSeparator +
92 arraysize(kLanguagesThatHaveNoSeparator)) {
93 return "";
94 } else if (std::find_if(
95 kLanguagesThatUseAnArabicComma,
96 kLanguagesThatUseAnArabicComma +
97 arraysize(kLanguagesThatUseAnArabicComma),
98 std::bind2nd(EqualToTolowerString(), base_language)) !=
99 kLanguagesThatUseAnArabicComma +
100 arraysize(kLanguagesThatUseAnArabicComma)) {
101 return kArabicCommaSeparator;
102 }
103 // Either the language is a Latin-script language, or no language was
104 // specified. In the latter case we still return ", " as the most common
105 // separator in use. In countries that don't use this, e.g. Thailand,
106 // addresses are often written in Latin script where this would still be
107 // appropriate, so this is a reasonable default in the absence of information.
108 return kCommaSeparator;
109 }
110
CombineLinesForLanguage(const std::vector<std::string> & lines,const std::string & language_tag,std::string * line)111 void CombineLinesForLanguage(const std::vector<std::string>& lines,
112 const std::string& language_tag,
113 std::string* line) {
114 line->clear();
115 std::string separator = GetLineSeparatorForLanguage(language_tag);
116 for (std::vector<std::string>::const_iterator it = lines.begin();
117 it != lines.end();
118 ++it) {
119 if (it != lines.begin()) {
120 line->append(separator);
121 }
122 line->append(*it);
123 }
124 }
125
126 } // namespace
127
GetFormattedNationalAddress(const AddressData & address_data,std::vector<std::string> * lines)128 void GetFormattedNationalAddress(
129 const AddressData& address_data, std::vector<std::string>* lines) {
130 assert(lines != NULL);
131 lines->clear();
132
133 Rule rule;
134 rule.CopyFrom(Rule::GetDefault());
135 // TODO: Eventually, we should get the best rule for this country and
136 // language, rather than just for the country.
137 rule.ParseSerializedRule(RegionDataConstants::GetRegionData(
138 address_data.region_code));
139
140 Language language(address_data.language_code);
141
142 // If Latin-script rules are available and the |language_code| of this address
143 // is explicitly tagged as being Latin, then use the Latin-script formatting
144 // rules.
145 const std::vector<FormatElement>& format =
146 language.has_latin_script && !rule.GetLatinFormat().empty()
147 ? rule.GetLatinFormat()
148 : rule.GetFormat();
149
150 // Address format without the unnecessary elements (based on which address
151 // fields are empty). We assume all literal strings that are not at the start
152 // or end of a line are separators, and therefore only relevant if the
153 // surrounding fields are filled in. This works with the data we have
154 // currently.
155 std::vector<FormatElement> pruned_format;
156 for (std::vector<FormatElement>::const_iterator
157 element_it = format.begin();
158 element_it != format.end();
159 ++element_it) {
160 // Always keep the newlines.
161 if (element_it->IsNewline() ||
162 // Always keep the non-empty address fields.
163 (element_it->IsField() &&
164 !address_data.IsFieldEmpty(element_it->GetField())) ||
165 // Only keep literals that satisfy these 2 conditions:
166 (!element_it->IsField() &&
167 // (1) Not preceding an empty field.
168 (element_it + 1 == format.end() ||
169 !(element_it + 1)->IsField() ||
170 !address_data.IsFieldEmpty((element_it + 1)->GetField())) &&
171 // (2) Not following a removed field.
172 (element_it == format.begin() ||
173 !(element_it - 1)->IsField() ||
174 (!pruned_format.empty() && pruned_format.back().IsField())))) {
175 pruned_format.push_back(*element_it);
176 }
177 }
178
179 std::string line;
180 for (std::vector<FormatElement>::const_iterator
181 element_it = pruned_format.begin();
182 element_it != pruned_format.end();
183 ++element_it) {
184 if (element_it->IsNewline()) {
185 if (!line.empty()) {
186 lines->push_back(line);
187 line.clear();
188 }
189 } else if (element_it->IsField()) {
190 AddressField field = element_it->GetField();
191 if (field == STREET_ADDRESS) {
192 // The field "street address" represents the street address lines of an
193 // address, so there can be multiple values.
194 if (!address_data.IsFieldEmpty(field)) {
195 line.append(address_data.address_line.front());
196 if (address_data.address_line.size() > 1U) {
197 lines->push_back(line);
198 line.clear();
199 lines->insert(lines->end(),
200 address_data.address_line.begin() + 1,
201 address_data.address_line.end());
202 }
203 }
204 } else {
205 line.append(address_data.GetFieldValue(field));
206 }
207 } else {
208 line.append(element_it->GetLiteral());
209 }
210 }
211 if (!line.empty()) {
212 lines->push_back(line);
213 }
214 }
215
GetFormattedNationalAddressLine(const AddressData & address_data,std::string * line)216 void GetFormattedNationalAddressLine(
217 const AddressData& address_data, std::string* line) {
218 std::vector<std::string> address_lines;
219 GetFormattedNationalAddress(address_data, &address_lines);
220 CombineLinesForLanguage(address_lines, address_data.language_code, line);
221 }
222
GetStreetAddressLinesAsSingleLine(const AddressData & address_data,std::string * line)223 void GetStreetAddressLinesAsSingleLine(
224 const AddressData& address_data, std::string* line) {
225 CombineLinesForLanguage(
226 address_data.address_line, address_data.language_code, line);
227 }
228
229 } // namespace addressinput
230 } // namespace i18n
231