tools/aapt/pseudolocalize.cpp

#include "pseudolocalize.h"

using namespace std;

// Base text from which expansion padding is produced; see
// pseudo_generate_expansion(), which repeats and truncates it.
// NOTE(review): the list spells "fiveteen" and has no "eighteen". Both are
// part of the generated output, so confirm before correcting them.
static const String16 k_expansion_string = String16("one two three "
    "four five six seven eight nine ten eleven twelve thirteen "
    "fourteen fiveteen sixteen seventeen nineteen twenty");

// Unicode directionality controls, spelled as UTF-8 byte sequences:
//   k_rlm = U+200F RIGHT-TO-LEFT MARK
//   k_rlo = U+202E RIGHT-TO-LEFT OVERRIDE
//   k_pdf = U+202C POP DIRECTIONAL FORMATTING
// (presumably String16(const char*) decodes UTF-8 -- confirm against String16.)
static const String16 k_rlm = String16("\xe2\x80\x8f");
static const String16 k_rlo = String16("\xE2\x80\xae");
static const String16 k_pdf = String16("\xE2\x80\xac");

// Marks wrapped around a placeholder by PseudoMethodAccent::placeholder():
// the opening mark is U+00BB and the closing mark is U+00AB, as UTF-8 bytes.
static const String16 k_placeholder_open = String16("\xc2\xbb");
static const String16 k_placeholder_close = String16("\xc2\xab");

// Delimiters that Pseudolocalizer::text() counts to track nesting depth.
static const char16_t k_arg_start = '{';
static const char16_t k_arg_end = '}';

// Creates a pseudolocalizer that uses method m, starting at nesting depth 0.
Pseudolocalizer::Pseudolocalizer(PseudolocalizationMethod m)
    : mImpl(nullptr), mLastDepth(0) {
  // mImpl has to be null here because setMethod() deletes whatever it finds
  // in mImpl before installing the new implementation.
  setMethod(m);
}

/**
 * Replaces the active implementation with the one selected by m.
 *
 * PSEUDO_ACCENTED selects PseudoMethodAccent, PSEUDO_BIDI selects
 * PseudoMethodBidi, and every other value selects PseudoMethodNone.
 */
void Pseudolocalizer::setMethod(PseudolocalizationMethod m) {
  // Deleting a null pointer is a no-op, so the previous implementation can
  // be released without a guard.
  delete mImpl;

  if (m == PSEUDO_ACCENTED) {
    mImpl = new PseudoMethodAccent();
    return;
  }
  if (m == PSEUDO_BIDI) {
    mImpl = new PseudoMethodBidi();
    return;
  }
  mImpl = new PseudoMethodNone();
}

/**
 * Pseudolocalizes a piece of message text while tracking '{' / '}' nesting.
 *
 * The nesting depth is kept in mLastDepth, so it carries over from one call
 * to the next. The text is cut into chunks wherever the depth changes:
 *  - a chunk that starts at an even depth is passed to mImpl->text();
 *  - a chunk that starts at an odd depth is copied as is, or passed to
 *    mImpl->placeholder() when it both starts with '{' and ends with '}'.
 * mImpl->start() is emitted when the depth rises out of an odd level and
 * mImpl->end() when it drops out of an even level.
 *
 * A single quote and the character that follows it never change the depth.
 *
 * @param text the text to process.
 * @return the processed text.
 */
String16 Pseudolocalizer::text(const String16& text) {
  String16 out;
  size_t depth = mLastDepth;
  size_t lastpos, pos;
  const size_t length = text.size();
  const char16_t* str = text.string();
  bool escaped = false;
  for (lastpos = pos = 0; pos < length; pos++) {
    char16_t c = str[pos];

    // A quote and the character it quotes are not interpreted as braces.
    // They must still reach the flush check below: skipping it with
    // `continue` dropped the whole trailing chunk whenever the input ended
    // with a quote or a quoted character.
    bool skipped = false;
    if (escaped) {
      escaped = false;
      skipped = true;
    } else if (c == '\'') {
      escaped = true;
      skipped = true;
    }

    if (!skipped) {
      if (c == k_arg_start) {
        depth++;
      } else if (c == k_arg_end && depth) {
        depth--;
      }
    }

    // Flush on every depth change and on the last character.
    if (mLastDepth != depth || pos == length - 1) {
      bool pseudo = ((mLastDepth % 2) == 0);
      size_t nextpos = pos;
      // The current character belongs to this chunk unless it is the brace
      // that closes a pseudolocalized (even-depth) run.
      if (!pseudo || depth == mLastDepth) {
        nextpos++;
      }
      size_t size = nextpos - lastpos;
      if (size) {
        String16 chunk = String16(text, size, lastpos);
        if (pseudo) {
          chunk = mImpl->text(chunk);
        } else if (!skipped && str[lastpos] == k_arg_start &&
                   str[nextpos - 1] == k_arg_end) {
          // A quoted '}' does not close the argument, hence the !skipped test.
          chunk = mImpl->placeholder(chunk);
        }
        out.append(chunk);
      }
      if (pseudo && depth < mLastDepth) { // End of message
        out.append(mImpl->end());
      } else if (!pseudo && depth > mLastDepth) { // Start of message
        out.append(mImpl->start());
      }
      lastpos = nextpos;
      mLastDepth = depth;
    }
  }
  return out;
}

/**
 * Returns the UTF-8 replacement used to pseudolocalize the character c, or a
 * null pointer when c has no replacement.
 *
 * Replacements exist for the ASCII letters (except 'F', which has no entry)
 * and for '!', '?' and '$'.
 */
static const char*
pseudolocalize_char(const char16_t c)
{
    // Replacements for 'a'..'z', indexed by (c - 'a').
    static const char* const kLower[26] = {
        "\xc3\xa5",     // a
        "\xc9\x93",     // b
        "\xc3\xa7",     // c
        "\xc3\xb0",     // d
        "\xc3\xa9",     // e
        "\xc6\x92",     // f
        "\xc4\x9d",     // g
        "\xc4\xa5",     // h
        "\xc3\xae",     // i
        "\xc4\xb5",     // j
        "\xc4\xb7",     // k
        "\xc4\xbc",     // l
        "\xe1\xb8\xbf", // m
        "\xc3\xb1",     // n
        "\xc3\xb6",     // o
        "\xc3\xbe",     // p
        "\x51",         // q
        "\xc5\x95",     // r
        "\xc5\xa1",     // s
        "\xc5\xa3",     // t
        "\xc3\xbb",     // u
        "\x56",         // v
        "\xc5\xb5",     // w
        "\xd1\x85",     // x
        "\xc3\xbd",     // y
        "\xc5\xbe",     // z
    };
    // Replacements for 'A'..'Z', indexed by (c - 'A').
    static const char* const kUpper[26] = {
        "\xc3\x85",     // A
        "\xce\xb2",     // B
        "\xc3\x87",     // C
        "\xc3\x90",     // D
        "\xc3\x89",     // E
        nullptr,        // F has no replacement
        "\xc4\x9c",     // G
        "\xc4\xa4",     // H
        "\xc3\x8e",     // I
        "\xc4\xb4",     // J
        "\xc4\xb6",     // K
        "\xc4\xbb",     // L
        "\xe1\xb8\xbe", // M
        "\xc3\x91",     // N
        "\xc3\x96",     // O
        "\xc3\x9e",     // P
        "\x71",         // Q
        "\xc5\x94",     // R
        "\xc5\xa0",     // S
        "\xc5\xa2",     // T
        "\xc3\x9b",     // U
        "\xce\xbd",     // V
        "\xc5\xb4",     // W
        "\xc3\x97",     // X
        "\xc3\x9d",     // Y
        "\xc5\xbd",     // Z
    };

    if (c >= 'a' && c <= 'z') {
        return kLower[c - 'a'];
    }
    if (c >= 'A' && c <= 'Z') {
        return kUpper[c - 'A'];
    }
    switch (c) {
        case '!':   return "\xc2\xa1";
        case '?':   return "\xc2\xbf";
        case '$':   return "\xe2\x82\xac";
        default:    return nullptr;
    }
}

/**
 * Returns true when c is one of the characters that PseudoMethodAccent::text()
 * accepts as the last character of a '%' placeholder.
 *
 * 't' is deliberately absent: the caller handles it separately because it is
 * followed by one more character.
 */
static bool is_possible_normal_placeholder_end(const char16_t c) {
    switch (c) {
        case 's': case 'S':
        case 'c': case 'C':
        case 'd': case 'o':
        case 'x': case 'X':
        case 'f':
        case 'e': case 'E':
        case 'g': case 'G':
        case 'a': case 'A':
        case 'b': case 'B':
        case 'h': case 'H':
        case '%':
        case 'n':
            return true;
        default:
            return false;
    }
}

/**
 * Builds padding text of roughly `length` characters from k_expansion_string.
 *
 * When the base text is shorter than `length`, a space and a recursively
 * generated remainder are appended. Otherwise the base text is cut after
 * `length` characters plus however many more the scan below consumes; the
 * scan starts at index length + 1 and stops at the first space it finds.
 */
static String16 pseudo_generate_expansion(const unsigned int length) {
    String16 expansion = k_expansion_string;

    if (expansion.size() < length) {
        expansion += String16(" ");
        expansion += pseudo_generate_expansion(length - expansion.size());
        return expansion;
    }

    // Extend the cut point character by character, stopping once a space
    // has been consumed.
    const char16_t* chars = expansion.string();
    unsigned int keep = length;
    for (unsigned int idx = length + 1; idx < expansion.size(); ++idx) {
        ++keep;
        if (chars[idx] == ' ') {
            break;
        }
    }
    // Only the first `keep` characters survive.
    return String16(expansion, keep);
}

// Returns true for a space, a tab or a newline; false for everything else.
static bool is_space(const char16_t c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\n':
      return true;
    default:
      return false;
  }
}

/**
 * Opens a message: resets the word and length counters and goes one level
 * deeper. Returns "[" when called at depth 0, otherwise an empty string.
 */
String16 PseudoMethodAccent::start() {
  const bool outermost = (mDepth == 0);

  mWordCount = mLength = 0;
  mDepth++;

  if (outermost) {
    return String16(String8("["));
  }
  return String16();
}

/**
 * Closes a message. When any characters were counted, a space and expansion
 * padding are produced: padding of mLength characters for more than three
 * counted words, otherwise of half that. The counters are then reset, the
 * depth drops by one, and "]" is appended once the depth is back at 0.
 */
String16 PseudoMethodAccent::end() {
  String16 suffix;

  if (mLength) {
    suffix.append(String16(String8(" ")));
    if (mWordCount > 3) {
      suffix.append(pseudo_generate_expansion(mLength));
    } else {
      suffix.append(pseudo_generate_expansion(mLength / 2));
    }
  }

  mWordCount = mLength = 0;
  mDepth--;

  if (mDepth == 0) {
    suffix.append(String16(String8("]")));
  }
  return suffix;
}

/**
 * Converts characters so they look like they've been localized.
 *
 * Backslash escapes, '%' placeholders and HTML tags/entities are copied
 * through without accenting; placeholders are additionally wrapped by
 * placeholder() unless they end with '%'. Every other character is replaced
 * via pseudolocalize_char() when a replacement exists and is counted in
 * mLength; mWordCount is updated from the characters that have no
 * replacement.
 *
 * All scans are bounded by the length of source, so a truncated escape,
 * placeholder, tag or entity at the end of the input is copied as far as it
 * goes and nothing beyond the last character is read or emitted.
 *
 * Note: This leaves escape sequences untouched so they can later be
 * processed by ResTable::collectString in the normal way.
 *
 * @param source the text to convert.
 * @return the converted text.
 */
String16 PseudoMethodAccent::text(const String16& source)
{
    const char16_t* s = source.string();
    String16 result;
    const size_t I = source.size();
    bool lastspace = true;
    for (size_t i=0; i<I; i++) {
        char16_t c = s[i];
        if (c == '\\') {
            // Escape syntax, no need to pseudolocalize
            if (i<I-1) {
                result += String16("\\");
                i++;
                c = s[i];
                if (c == 'u') {
                    // A complete \uXXXX takes up 5 chars after the backslash;
                    // copy fewer when the input ends early.
                    const size_t remaining = I - i;
                    const size_t count = (remaining < 5) ? remaining : 5;
                    result += String16(s+i, count);
                    i += count - 1;
                } else {
                    // Any other escaped character is copied verbatim.
                    result.append(&c, 1);
                }
            } else {
                // Lone backslash at the very end.
                result.append(&c, 1);
            }
        } else if (c == '%') {
            // Placeholder syntax, no need to pseudolocalize
            String16 chunk;
            bool end = false;
            chunk.append(&c, 1);
            // Only advance while another character really exists.
            while (!end && i + 1 < I) {
                ++i;
                c = s[i];
                chunk.append(&c, 1);
                if (is_possible_normal_placeholder_end(c)) {
                    end = true;
                } else if (c == 't' && i + 1 < I) {
                    // 't' is followed by one more character.
                    ++i;
                    c = s[i];
                    chunk.append(&c, 1);
                    end = true;
                }
            }
            // Treat chunk as a placeholder unless it ends with %.
            result += ((c == '%') ? chunk : placeholder(chunk));
        } else if (c == '<' || c == '&') {
            // html syntax, no need to pseudolocalize
            bool tag_closed = false;
            while (!tag_closed && i < I) {
                if (c == '&') {
                    String16 escape_text;
                    escape_text.append(&c, 1);
                    bool end = false;
                    size_t htmlCodePos = i;
                    // Only advance while another character really exists.
                    while (!end && htmlCodePos + 1 < I) {
                        ++htmlCodePos;
                        c = s[htmlCodePos];
                        escape_text.append(&c, 1);
                        // Valid html code
                        if (c == ';') {
                            end = true;
                            i = htmlCodePos;
                        }
                        // Wrong html code
                        // NOTE(review): i is not advanced in this case, so
                        // the characters copied into escape_text are visited
                        // again by the outer loop -- confirm this is intended.
                        else if (!((c == '#' ||
                                 (c >= 'a' && c <= 'z') ||
                                 (c >= 'A' && c <= 'Z') ||
                                 (c >= '0' && c <= '9')))) {
                            end = true;
                        }
                    }
                    result += escape_text;
                    // "&lt;" opens an escaped tag, so keep copying until the
                    // tag is closed; any other entity ends the html run.
                    // NOTE(review): after "&lt;" the next iteration appends
                    // the current character (';') once more -- confirm.
                    if (escape_text != String16("&lt;")) {
                        tag_closed = true;
                    }
                    continue;
                }
                if (c == '>') {
                    tag_closed = true;
                    result.append(&c, 1);
                    continue;
                }
                result.append(&c, 1);
                i++;
                if (i < I) {
                    c = s[i];
                }
            }
        } else {
            // This is a pure text that should be pseudolocalized
            const char* p = pseudolocalize_char(c);
            if (p != NULL) {
                result += String16(p);
            } else {
                bool space = is_space(c);
                if (lastspace && !space) {
                  mWordCount++;
                }
                lastspace = space;
                result.append(&c, 1);
            }
            // Count only pseudolocalizable chars and delimiters
            mLength++;
        }
    }
    return result;
}
// Returns source enclosed between k_placeholder_open and
// k_placeholder_close.
String16 PseudoMethodAccent::placeholder(const String16& source) {
  String16 wrapped(k_placeholder_open);
  wrapped += source;
  wrapped += k_placeholder_close;
  return wrapped;
}

/**
 * Wraps every word of source in directionality controls: k_rlm + k_rlo is
 * inserted before a word and k_pdf + k_rlm after it.
 *
 * Words are separated by is_space() characters and by the escape sequences
 * \n and \t. A backslash escape is copied through together with the
 * character it escapes. A lone backslash at the very end of source has
 * nothing to escape and is kept as an ordinary word character instead of
 * being dropped.
 *
 * @param source the text to convert.
 * @return the converted text.
 */
String16 PseudoMethodBidi::text(const String16& source)
{
    const char16_t* s = source.string();
    const size_t length = source.size();
    String16 result;
    bool lastspace = true;
    bool space = true;
    bool escape = false;
    const char16_t ESCAPE_CHAR = '\\';
    for (size_t i=0; i<length; i++) {
        char16_t c = s[i];
        // Start an escape only when a character follows to be escaped.
        if (!escape && c == ESCAPE_CHAR && i + 1 < length) {
          escape = true;
          continue;
        }
        space = (!escape && is_space(c)) || (escape && (c == 'n' || c == 't'));
        if (lastspace && !space) {
          // Word start
          result += k_rlm + k_rlo;
        } else if (!lastspace && space) {
          // Word end
          result += k_pdf + k_rlm;
        }
        lastspace = space;
        if (escape) {
          // Re-emit the backslash that was held back on the previous pass.
          result.append(&ESCAPE_CHAR, 1);
          escape=false;
        }
        result.append(&c, 1);
    }
    if (!lastspace) {
      // End of last word
      result += k_pdf + k_rlm;
    }
    return result;
}

// Returns source preceded by k_rlm + k_rlo and followed by k_pdf + k_rlm,
// i.e. wrapped in the same directionality change sequence used for words.
String16 PseudoMethodBidi::placeholder(const String16& source) {
  String16 wrapped(k_rlm);
  wrapped += k_rlo;
  wrapped += source;
  wrapped += k_pdf;
  wrapped += k_rlm;
  return wrapped;
}