• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include "pseudolocalize.h"
2 
3 using namespace std;
4 
5 // String basis to generate expansion
6 static const String16 k_expansion_string = String16("one two three "
7     "four five six seven eight nine ten eleven twelve thirteen "
8     "fourteen fiveteen sixteen seventeen nineteen twenty");
9 
10 // Special unicode characters to override directionality of the words
11 static const String16 k_rlm = String16("\xe2\x80\x8f");
12 static const String16 k_rlo = String16("\xE2\x80\xae");
13 static const String16 k_pdf = String16("\xE2\x80\xac");
14 
15 // Placeholder marks
16 static const String16 k_placeholder_open = String16("\xc2\xbb");
17 static const String16 k_placeholder_close = String16("\xc2\xab");
18 
19 static const char16_t k_arg_start = '{';
20 static const char16_t k_arg_end = '}';
21 
Pseudolocalizer(PseudolocalizationMethod m)22 Pseudolocalizer::Pseudolocalizer(PseudolocalizationMethod m)
23     : mImpl(nullptr), mLastDepth(0) {
24   setMethod(m);
25 }
26 
setMethod(PseudolocalizationMethod m)27 void Pseudolocalizer::setMethod(PseudolocalizationMethod m) {
28   if (mImpl) {
29     delete mImpl;
30   }
31   if (m == PSEUDO_ACCENTED) {
32     mImpl = new PseudoMethodAccent();
33   } else if (m == PSEUDO_BIDI) {
34     mImpl = new PseudoMethodBidi();
35   } else {
36     mImpl = new PseudoMethodNone();
37   }
38 }
39 
/**
 * Pseudolocalizes |text|, treating brace-delimited arguments specially.
 *
 * The brace nesting depth is kept in mLastDepth and therefore carries over
 * from one call to the next. Runs of characters at an even depth are passed
 * to the implementation's text(); runs at an odd depth are copied through,
 * or passed to placeholder() when the run is a complete "{...}" argument.
 * The implementation's start()/end() are appended when a nested message is
 * entered or left. A single quote makes the scan skip the character that
 * follows it, so a quoted brace does not change the depth.
 *
 * @param text the string (or piece of a string) to convert.
 * @return the converted string.
 */
String16 Pseudolocalizer::text(const String16& text) {
  String16 out;
  size_t depth = mLastDepth;
  size_t lastpos = 0;
  const size_t length = text.size();
  const char16_t* str = text.c_str();
  bool escaped = false;
  for (size_t pos = 0; pos < length; pos++) {
    const char16_t c = str[pos];
    if (escaped) {
      escaped = false;
      continue;
    }
    if (c == '\'') {
      escaped = true;
      continue;
    }

    if (c == k_arg_start) {
      depth++;
    } else if (c == k_arg_end && depth) {
      depth--;
    }

    if (mLastDepth != depth || pos == length - 1) {
      const bool pseudo = ((mLastDepth % 2) == 0);
      size_t nextpos = pos;
      // A brace that ends a run of translatable text belongs to the next
      // chunk; in every other case the current character ends this chunk.
      if (!pseudo || depth == mLastDepth) {
        nextpos++;
      }
      const size_t size = nextpos - lastpos;
      if (size) {
        String16 chunk = String16(text, size, lastpos);
        if (pseudo) {
          chunk = mImpl->text(chunk);
        } else if (str[lastpos] == k_arg_start &&
                   str[nextpos - 1] == k_arg_end) {
          chunk = mImpl->placeholder(chunk);
        }
        out.append(chunk);
      }
      if (pseudo && depth < mLastDepth) { // End of message
        out.append(mImpl->end());
      } else if (!pseudo && depth > mLastDepth) { // Start of message
        out.append(mImpl->start());
      }
      lastpos = nextpos;
      mLastDepth = depth;
    }
  }

  // The loop only flushes on a depth change or on the final index. If the
  // final index was skipped (it was a quote or a quoted character), or a
  // brace was left pending at lastpos, the tail has not been emitted yet.
  // Emit it here with the same rules so that no input is dropped.
  if (lastpos < length) {
    String16 chunk = String16(text, length - lastpos, lastpos);
    if ((mLastDepth % 2) == 0) {
      chunk = mImpl->text(chunk);
    } else if (str[lastpos] == k_arg_start &&
               str[length - 1] == k_arg_end) {
      chunk = mImpl->placeholder(chunk);
    }
    out.append(chunk);
  }
  return out;
}
92 
// Returns the UTF-8 encoded replacement for |c|, or a null pointer when |c|
// has no entry in the table. Note that the table has no entry for 'F'.
static const char*
pseudolocalize_char(const char16_t c)
{
    struct Replacement {
        char16_t plain;
        const char* utf8;
    };
    static const Replacement kReplacements[] = {
        {'a', "\xc3\xa5"},
        {'b', "\xc9\x93"},
        {'c', "\xc3\xa7"},
        {'d', "\xc3\xb0"},
        {'e', "\xc3\xa9"},
        {'f', "\xc6\x92"},
        {'g', "\xc4\x9d"},
        {'h', "\xc4\xa5"},
        {'i', "\xc3\xae"},
        {'j', "\xc4\xb5"},
        {'k', "\xc4\xb7"},
        {'l', "\xc4\xbc"},
        {'m', "\xe1\xb8\xbf"},
        {'n', "\xc3\xb1"},
        {'o', "\xc3\xb6"},
        {'p', "\xc3\xbe"},
        {'q', "\x51"},
        {'r', "\xc5\x95"},
        {'s', "\xc5\xa1"},
        {'t', "\xc5\xa3"},
        {'u', "\xc3\xbb"},
        {'v', "\x56"},
        {'w', "\xc5\xb5"},
        {'x', "\xd1\x85"},
        {'y', "\xc3\xbd"},
        {'z', "\xc5\xbe"},
        {'A', "\xc3\x85"},
        {'B', "\xce\xb2"},
        {'C', "\xc3\x87"},
        {'D', "\xc3\x90"},
        {'E', "\xc3\x89"},
        {'G', "\xc4\x9c"},
        {'H', "\xc4\xa4"},
        {'I', "\xc3\x8e"},
        {'J', "\xc4\xb4"},
        {'K', "\xc4\xb6"},
        {'L', "\xc4\xbb"},
        {'M', "\xe1\xb8\xbe"},
        {'N', "\xc3\x91"},
        {'O', "\xc3\x96"},
        {'P', "\xc3\x9e"},
        {'Q', "\x71"},
        {'R', "\xc5\x94"},
        {'S', "\xc5\xa0"},
        {'T', "\xc5\xa2"},
        {'U', "\xc3\x9b"},
        {'V', "\xce\xbd"},
        {'W', "\xc5\xb4"},
        {'X', "\xc3\x97"},
        {'Y', "\xc3\x9d"},
        {'Z', "\xc5\xbd"},
        {'!', "\xc2\xa1"},
        {'?', "\xc2\xbf"},
        {'$', "\xe2\x82\xac"},
    };
    for (const Replacement& entry : kReplacements) {
        if (entry.plain == c) {
            return entry.utf8;
        }
    }
    return nullptr;
}
154 
// Returns true when |c| is one of the characters that the '%' placeholder
// scan in PseudoMethodAccent::text accepts as the end of a placeholder.
static bool is_possible_normal_placeholder_end(const char16_t c) {
    static const char kTerminators[] = "sScCdoxXfeEgGaAbBhH%n";
    for (const char* t = kTerminators; *t != '\0'; ++t) {
        if (c == static_cast<char16_t>(*t)) {
            return true;
        }
    }
    return false;
}
181 
pseudo_generate_expansion(const unsigned int length)182 static String16 pseudo_generate_expansion(const unsigned int length) {
183     String16 result = k_expansion_string;
184     const char16_t* s = result.c_str();
185     if (result.size() < length) {
186         result += String16(" ");
187         result += pseudo_generate_expansion(length - result.size());
188     } else {
189         int ext = 0;
190         // Should contain only whole words, so looking for a space
191         for (unsigned int i = length + 1; i < result.size(); ++i) {
192           ++ext;
193           if (s[i] == ' ') {
194             break;
195           }
196         }
197         // Just keep the first length + ext characters
198         result = String16(result, length + ext);
199     }
200     return result;
201 }
202 
// Returns true for a space, a tab or a newline.
static bool is_space(const char16_t c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\n':
      return true;
    default:
      return false;
  }
}
206 
start()207 String16 PseudoMethodAccent::start() {
208   String16 result;
209   if (mDepth == 0) {
210     result = String16(String8("["));
211   }
212   mWordCount = mLength = 0;
213   mDepth++;
214   return result;
215 }
216 
end()217 String16 PseudoMethodAccent::end() {
218   String16 result;
219   if (mLength) {
220     result.append(String16(String8(" ")));
221     result.append(pseudo_generate_expansion(
222         mWordCount > 3 ? mLength : mLength / 2));
223   }
224   mWordCount = mLength = 0;
225   mDepth--;
226   if (mDepth == 0) {
227     result.append(String16(String8("]")));
228   }
229   return result;
230 }
231 
232 /**
233  * Converts characters so they look like they've been localized.
234  *
235  * Note: This leaves escape sequences untouched so they can later be
236  * processed by ResTable::collectString in the normal way.
237  */
text(const String16 & source)238 String16 PseudoMethodAccent::text(const String16& source)
239 {
240     const char16_t* s = source.c_str();
241     String16 result;
242     const size_t I = source.size();
243     bool lastspace = true;
244     for (size_t i=0; i<I; i++) {
245         char16_t c = s[i];
246         if (c == '\\') {
247             // Escape syntax, no need to pseudolocalize
248             if (i<I-1) {
249                 result += String16("\\");
250                 i++;
251                 c = s[i];
252                 switch (c) {
253                     case 'u':
254                         // this one takes up 5 chars
255                         result += String16(s+i, 5);
256                         i += 4;
257                         break;
258                     case 't':
259                     case 'n':
260                     case '#':
261                     case '@':
262                     case '?':
263                     case '"':
264                     case '\'':
265                     case '\\':
266                     default:
267                         result.append(&c, 1);
268                         break;
269                 }
270             } else {
271                 result.append(&c, 1);
272             }
273         } else if (c == '%') {
274             // Placeholder syntax, no need to pseudolocalize
275             String16 chunk;
276             bool end = false;
277             chunk.append(&c, 1);
278             while (!end && i < I) {
279                 ++i;
280                 c = s[i];
281                 chunk.append(&c, 1);
282                 if (is_possible_normal_placeholder_end(c)) {
283                     end = true;
284                 } else if (c == 't') {
285                     ++i;
286                     c = s[i];
287                     chunk.append(&c, 1);
288                     end = true;
289                 }
290             }
291             // Treat chunk as a placeholder unless it ends with %.
292             result += ((c == '%') ? chunk : placeholder(chunk));
293         } else if (c == '<' || c == '&') {
294             // html syntax, no need to pseudolocalize
295             bool tag_closed = false;
296             while (!tag_closed && i < I) {
297                 if (c == '&') {
298                     String16 escape_text;
299                     escape_text.append(&c, 1);
300                     bool end = false;
301                     size_t htmlCodePos = i;
302                     while (!end && htmlCodePos < I) {
303                         ++htmlCodePos;
304                         c = s[htmlCodePos];
305                         escape_text.append(&c, 1);
306                         // Valid html code
307                         if (c == ';') {
308                             end = true;
309                             i = htmlCodePos;
310                         }
311                         // Wrong html code
312                         else if (!((c == '#' ||
313                                  (c >= 'a' && c <= 'z') ||
314                                  (c >= 'A' && c <= 'Z') ||
315                                  (c >= '0' && c <= '9')))) {
316                             end = true;
317                         }
318                     }
319                     result += escape_text;
320                     if (escape_text != String16("&lt;")) {
321                         tag_closed = true;
322                     }
323                     continue;
324                 }
325                 if (c == '>') {
326                     tag_closed = true;
327                     result.append(&c, 1);
328                     continue;
329                 }
330                 result.append(&c, 1);
331                 i++;
332                 c = s[i];
333             }
334         } else {
335             // This is a pure text that should be pseudolocalized
336             const char* p = pseudolocalize_char(c);
337             if (p != NULL) {
338                 result += String16(p);
339             } else {
340                 bool space = is_space(c);
341                 if (lastspace && !space) {
342                   mWordCount++;
343                 }
344                 lastspace = space;
345                 result.append(&c, 1);
346             }
347             // Count only pseudolocalizable chars and delimiters
348             mLength++;
349         }
350     }
351     return result;
352 }
// Returns |source| enclosed between k_placeholder_open and
// k_placeholder_close.
String16 PseudoMethodAccent::placeholder(const String16& source) {
  String16 wrapped(k_placeholder_open);
  wrapped.append(source);
  wrapped.append(k_placeholder_close);
  return wrapped;
}
357 
/**
 * Wraps every word of |source| in a directionality override.
 *
 * Each word is preceded by k_rlm + k_rlo and followed by k_pdf + k_rlm.
 * Words are separated by whitespace (see is_space) or by the escape
 * sequences \n and \t. Escape sequences are copied through unchanged.
 *
 * @param source the text to convert.
 * @return the converted text.
 */
String16 PseudoMethodBidi::text(const String16& source)
{
    const char16_t* s = source.c_str();
    const size_t length = source.size();
    const char16_t ESCAPE_CHAR = '\\';
    String16 result;
    bool lastspace = true;
    bool escape = false;
    for (size_t i = 0; i < length; i++) {
        char16_t c = s[i];
        // A backslash starts an escape only when a character follows it.
        // A lone backslash at the very end is treated as ordinary text so
        // that it is not dropped, matching PseudoMethodAccent::text.
        if (!escape && c == ESCAPE_CHAR && i + 1 < length) {
          escape = true;
          continue;
        }
        const bool space = (!escape && is_space(c)) ||
                           (escape && (c == 'n' || c == 't'));
        if (lastspace && !space) {
          // Word start
          result += k_rlm + k_rlo;
        } else if (!lastspace && space) {
          // Word end
          result += k_pdf + k_rlm;
        }
        lastspace = space;
        if (escape) {
          // Put back the backslash that was skipped on the previous round.
          result.append(&ESCAPE_CHAR, 1);
          escape = false;
        }
        result.append(&c, 1);
    }
    if (!lastspace) {
      // End of last word
      result += k_pdf + k_rlm;
    }
    return result;
}
393 
// Returns |source| preceded by k_rlm + k_rlo and followed by
// k_pdf + k_rlm, i.e. the same wrapping that text() puts around a word.
String16 PseudoMethodBidi::placeholder(const String16& source) {
  String16 wrapped(k_rlm);
  wrapped.append(k_rlo);
  wrapped.append(source);
  wrapped.append(k_pdf);
  wrapped.append(k_rlm);
  return wrapped;
}
398 
399