1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "compile/Pseudolocalizer.h"
18
19 #include "util/Util.h"
20
21 using android::StringPiece;
22
23 namespace aapt {
24
// Word list that PseudoGenerateExpansion() copies from when padding a
// pseudolocalized message. The text (including the "fiveteen" spelling and the
// absence of "eighteen") is kept exactly as-is because it is emitted verbatim.
static const std::string kExpansionString{
    "one two three "
    "four five six seven eight nine ten eleven twelve thirteen "
    "fourteen fiveteen sixteen seventeen nineteen twenty"};

// Unicode directionality controls used by the bidi method:
// U+200F RIGHT-TO-LEFT MARK, U+202E RIGHT-TO-LEFT OVERRIDE and
// U+202C POP DIRECTIONAL FORMATTING.
static const std::string kRlm{"\u200f"};
static const std::string kRlo{"\u202e"};
static const std::string kPdf{"\u202c"};

// Marks that the accent method puts around a placeholder:
// U+00BB (opening) and U+00AB (closing).
static const std::string kPlaceholderOpen{"\u00bb"};
static const std::string kPlaceholderClose{"\u00ab"};

// Delimiters that open and close an argument inside a message.
static constexpr char kArgStart{'{'};
static constexpr char kArgEnd{'}'};
42
43 class PseudoMethodNone : public PseudoMethodImpl {
44 public:
Text(const StringPiece & text)45 std::string Text(const StringPiece& text) override { return text.to_string(); }
Placeholder(const StringPiece & text)46 std::string Placeholder(const StringPiece& text) override { return text.to_string(); }
47 };
48
49 class PseudoMethodBidi : public PseudoMethodImpl {
50 public:
51 std::string Text(const StringPiece& text) override;
52 std::string Placeholder(const StringPiece& text) override;
53 };
54
55 class PseudoMethodAccent : public PseudoMethodImpl {
56 public:
PseudoMethodAccent()57 PseudoMethodAccent() : depth_(0), word_count_(0), length_(0) {}
58 std::string Start() override;
59 std::string End() override;
60 std::string Text(const StringPiece& text) override;
61 std::string Placeholder(const StringPiece& text) override;
62
63 private:
64 size_t depth_;
65 size_t word_count_;
66 size_t length_;
67 };
68
Pseudolocalizer(Method method)69 Pseudolocalizer::Pseudolocalizer(Method method) : last_depth_(0) {
70 SetMethod(method);
71 }
72
SetMethod(Method method)73 void Pseudolocalizer::SetMethod(Method method) {
74 switch (method) {
75 case Method::kNone:
76 impl_ = util::make_unique<PseudoMethodNone>();
77 break;
78 case Method::kAccent:
79 impl_ = util::make_unique<PseudoMethodAccent>();
80 break;
81 case Method::kBidi:
82 impl_ = util::make_unique<PseudoMethodBidi>();
83 break;
84 }
85 }
86
Text(const StringPiece & text)87 std::string Pseudolocalizer::Text(const StringPiece& text) {
88 std::string out;
89 size_t depth = last_depth_;
90 size_t lastpos, pos;
91 const size_t length = text.size();
92 const char* str = text.data();
93 bool escaped = false;
94 for (lastpos = pos = 0; pos < length; pos++) {
95 char16_t c = str[pos];
96 if (escaped) {
97 escaped = false;
98 continue;
99 }
100 if (c == '\'') {
101 escaped = true;
102 continue;
103 }
104
105 if (c == kArgStart) {
106 depth++;
107 } else if (c == kArgEnd && depth) {
108 depth--;
109 }
110
111 if (last_depth_ != depth || pos == length - 1) {
112 bool pseudo = ((last_depth_ % 2) == 0);
113 size_t nextpos = pos;
114 if (!pseudo || depth == last_depth_) {
115 nextpos++;
116 }
117 size_t size = nextpos - lastpos;
118 if (size) {
119 std::string chunk = text.substr(lastpos, size).to_string();
120 if (pseudo) {
121 chunk = impl_->Text(chunk);
122 } else if (str[lastpos] == kArgStart && str[nextpos - 1] == kArgEnd) {
123 chunk = impl_->Placeholder(chunk);
124 }
125 out.append(chunk);
126 }
127 if (pseudo && depth < last_depth_) { // End of message
128 out.append(impl_->End());
129 } else if (!pseudo && depth > last_depth_) { // Start of message
130 out.append(impl_->Start());
131 }
132 lastpos = nextpos;
133 last_depth_ = depth;
134 }
135 }
136 return out;
137 }
138
// Looks up the pseudolocalized replacement for |c|.
//
// Returns a NUL-terminated string for the ASCII letters (except 'F', which has
// no replacement) and for '!', '?' and '$'; returns nullptr for every other
// character. The tables below rely on 'a'..'z' and 'A'..'Z' being contiguous.
static const char* PseudolocalizeChar(const char c) {
  // Replacements for 'a' through 'z', in alphabetical order.
  static const char* const kLowercase[26] = {
      "\u00e5", "\u0253", "\u00e7", "\u00f0", "\u00e9", "\u0192", "\u011d",
      "\u0125", "\u00ee", "\u0135", "\u0137", "\u013c", "\u1e3f", "\u00f1",
      "\u00f6", "\u00fe", "\u0051", "\u0155", "\u0161", "\u0163", "\u00fb",
      "\u0056", "\u0175", "\u0445", "\u00fd", "\u017e",
  };
  // Replacements for 'A' through 'Z', in alphabetical order ('F' is unmapped).
  static const char* const kUppercase[26] = {
      "\u00c5", "\u03b2", "\u00c7", "\u00d0", "\u00c9", nullptr,  "\u011c",
      "\u0124", "\u00ce", "\u0134", "\u0136", "\u013b", "\u1e3e", "\u00d1",
      "\u00d6", "\u00de", "\u0071", "\u0154", "\u0160", "\u0162", "\u00db",
      "\u03bd", "\u0174", "\u00d7", "\u00dd", "\u017d",
  };

  if (c >= 'a' && c <= 'z') {
    return kLowercase[c - 'a'];
  }
  if (c >= 'A' && c <= 'Z') {
    return kUppercase[c - 'A'];
  }
  if (c == '!') {
    return "\u00a1";
  }
  if (c == '?') {
    return "\u00bf";
  }
  if (c == '$') {
    return "\u20ac";
  }
  return nullptr;
}
253
// Reports whether |c| is one of the characters that PseudoMethodAccent::Text()
// treats as the last character of a '%' placeholder.
static bool IsPossibleNormalPlaceholderEnd(const char c) {
  static const char kTerminators[] = {
      's', 'S', 'c', 'C', 'd', 'o', 'x', 'X', 'f', 'e', 'E',
      'g', 'G', 'a', 'A', 'b', 'B', 'h', 'H', '%', 'n',
  };
  for (const char terminator : kTerminators) {
    if (c == terminator) {
      return true;
    }
  }
  return false;
}
302
PseudoGenerateExpansion(const unsigned int length)303 static std::string PseudoGenerateExpansion(const unsigned int length) {
304 std::string result = kExpansionString;
305 const char* s = result.data();
306 if (result.size() < length) {
307 result += " ";
308 result += PseudoGenerateExpansion(length - result.size());
309 } else {
310 int ext = 0;
311 // Should contain only whole words, so looking for a space
312 for (unsigned int i = length + 1; i < result.size(); ++i) {
313 ++ext;
314 if (s[i] == ' ') {
315 break;
316 }
317 }
318 result = result.substr(0, length + ext);
319 }
320 return result;
321 }
322
Start()323 std::string PseudoMethodAccent::Start() {
324 std::string result;
325 if (depth_ == 0) {
326 result = "[";
327 }
328 word_count_ = length_ = 0;
329 depth_++;
330 return result;
331 }
332
End()333 std::string PseudoMethodAccent::End() {
334 std::string result;
335 if (length_) {
336 result += " ";
337 result += PseudoGenerateExpansion(word_count_ > 3 ? length_ : length_ / 2);
338 }
339 word_count_ = length_ = 0;
340 depth_--;
341 if (depth_ == 0) {
342 result += "]";
343 }
344 return result;
345 }
346
347 /**
348 * Converts characters so they look like they've been localized.
349 *
350 * Note: This leaves placeholder syntax untouched.
351 */
Text(const StringPiece & source)352 std::string PseudoMethodAccent::Text(const StringPiece& source) {
353 const char* s = source.data();
354 std::string result;
355 const size_t I = source.size();
356 bool lastspace = true;
357 for (size_t i = 0; i < I; i++) {
358 char c = s[i];
359 if (c == '%') {
360 // Placeholder syntax, no need to pseudolocalize
361 std::string chunk;
362 bool end = false;
363 chunk.append(&c, 1);
364 while (!end && i + 1 < I) {
365 ++i;
366 c = s[i];
367 chunk.append(&c, 1);
368 if (IsPossibleNormalPlaceholderEnd(c)) {
369 end = true;
370 } else if (i + 1 < I && c == 't') {
371 ++i;
372 c = s[i];
373 chunk.append(&c, 1);
374 end = true;
375 }
376 }
377 // Treat chunk as a placeholder unless it ends with %.
378 result += ((c == '%') ? chunk : Placeholder(chunk));
379 } else if (c == '<' || c == '&') {
380 // html syntax, no need to pseudolocalize
381 bool tag_closed = false;
382 while (!tag_closed && i < I) {
383 if (c == '&') {
384 std::string escape_text;
385 escape_text.append(&c, 1);
386 bool end = false;
387 size_t html_code_pos = i;
388 while (!end && html_code_pos < I) {
389 ++html_code_pos;
390 c = s[html_code_pos];
391 escape_text.append(&c, 1);
392 // Valid html code
393 if (c == ';') {
394 end = true;
395 i = html_code_pos;
396 }
397 // Wrong html code
398 else if (!((c == '#' || (c >= 'a' && c <= 'z') ||
399 (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')))) {
400 end = true;
401 }
402 }
403 result += escape_text;
404 if (escape_text != "<") {
405 tag_closed = true;
406 }
407 continue;
408 }
409 if (c == '>') {
410 tag_closed = true;
411 result.append(&c, 1);
412 continue;
413 }
414 result.append(&c, 1);
415 i++;
416 c = s[i];
417 }
418 } else {
419 // This is a pure text that should be pseudolocalized
420 const char* p = PseudolocalizeChar(c);
421 if (p != nullptr) {
422 result += p;
423 } else {
424 bool space = isspace(c);
425 if (lastspace && !space) {
426 word_count_++;
427 }
428 lastspace = space;
429 result.append(&c, 1);
430 }
431 // Count only pseudolocalizable chars and delimiters
432 length_++;
433 }
434 }
435 return result;
436 }
437
Placeholder(const StringPiece & source)438 std::string PseudoMethodAccent::Placeholder(const StringPiece& source) {
439 // Surround a placeholder with brackets
440 return kPlaceholderOpen + source.to_string() + kPlaceholderClose;
441 }
442
Text(const StringPiece & source)443 std::string PseudoMethodBidi::Text(const StringPiece& source) {
444 const char* s = source.data();
445 std::string result;
446 bool lastspace = true;
447 bool space = true;
448 bool escape = false;
449 const char ESCAPE_CHAR = '\\';
450 for (size_t i = 0; i < source.size(); i++) {
451 char c = s[i];
452 if (!escape && c == ESCAPE_CHAR) {
453 escape = true;
454 continue;
455 }
456 space = (!escape && isspace(c)) || (escape && (c == 'n' || c == 't'));
457 if (lastspace && !space) {
458 // Word start
459 result += kRlm + kRlo;
460 } else if (!lastspace && space) {
461 // Word end
462 result += kPdf + kRlm;
463 }
464 lastspace = space;
465 if (escape) {
466 result.append(&ESCAPE_CHAR, 1);
467 escape=false;
468 }
469 result.append(&c, 1);
470 }
471 if (!lastspace) {
472 // End of last word
473 result += kPdf + kRlm;
474 }
475 return result;
476 }
477
Placeholder(const StringPiece & source)478 std::string PseudoMethodBidi::Placeholder(const StringPiece& source) {
479 // Surround a placeholder with directionality change sequence
480 return kRlm + kRlo + source.to_string() + kPdf + kRlm;
481 }
482
483 } // namespace aapt
484