/* * Copyright (c) 2023 Huawei Device Co., Ltd. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "namemangler.h" #include #include #include namespace namemangler { #ifdef __MRT_DEBUG #define DEBUG_ASSERT(f) assert(f) #else #define DEBUG_ASSERT(f) ((void)0) #endif const int KLOCAL_CODE_BUF_SIZE = 1024; const int KMAX_CODEC_BUF_SIZE = (1 << 16); #define GETHEXCHAR(n) static_cast((n) < 10 ? (n) + '0' : (n) - 10 + 'a') #define GETHEXCHARU(n) static_cast((n) < 10 ? (n) + '0' : (n) - 10 + 'A') using StringMap = std::map; // The returned buffer needs to be explicitly freed static inline char *AllocCodecBuf(size_t maxLen) { if (maxLen == 0) { return nullptr; } // each char may have 2 more char, so give out the max space buffer constexpr int multi = 3; return reinterpret_cast( malloc((maxLen <= KLOCAL_CODE_BUF_SIZE) ? multi * maxLen : multi * KMAX_CODEC_BUF_SIZE)); } static inline void FreeCodecBuf(char *buf) { free(buf); } std::string EncodeName(const std::string &name) { // name is guaranteed to be null-terminated size_t nameLen = name.length(); nameLen = nameLen > KMAX_CODEC_BUF_SIZE ? KMAX_CODEC_BUF_SIZE : nameLen; char *buf = AllocCodecBuf(nameLen); if (buf == nullptr) { return std::string(name); } size_t pos = 0; size_t i = 0; std::string str(name); std::u16string str16; while (i < nameLen) { unsigned char c = static_cast(name[i]); if (c == '_') { buf[pos++] = '_'; buf[pos++] = '_'; } else if (c == '[') { buf[pos++] = 'A'; } else if (isalnum(c)) { buf[pos++] = static_cast(c); } else if (c <= 0x7F) { // _XX: '_' followed by ascii code in hex if (c == '.') { c = '/'; // use / in package name } buf[pos++] = '_'; unsigned char n = c >> 4; // get the high 4 bit and calculate buf[pos++] = GETHEXCHARU(n); n = static_cast(c - static_cast(n << 4)); // revert the high 4 bit buf[pos++] = GETHEXCHARU(n); } else { str16.clear(); // process one 16-bit char at a time unsigned int n = UTF8ToUTF16(str16, str.substr(i), 1, false); buf[pos++] = '_'; if ((n >> 16) == 1) { // if n is 16-bit unsigned short m = str16[0]; buf[pos++] = 'u'; buf[pos++] = GETHEXCHAR((m & 0xF000) >> 12); buf[pos++] = GETHEXCHAR((m & 0x0F00) >> 8); buf[pos++] = GETHEXCHAR((m & 0x00F0) >> 4); buf[pos++] = GETHEXCHAR(m & 0x000F); } else { unsigned short m = str16[0]; buf[pos++] = 'U'; buf[pos++] = GETHEXCHAR((m & 0xF000) >> 12); buf[pos++] = GETHEXCHAR((m & 0x0F00) >> 8); buf[pos++] = GETHEXCHAR((m & 0x00F0) >> 4); buf[pos++] = GETHEXCHAR(m & 0x000F); m = str16[1]; buf[pos++] = GETHEXCHAR((m & 0xF000) >> 12); buf[pos++] = GETHEXCHAR((m & 0x0F00) >> 8); buf[pos++] = GETHEXCHAR((m & 0x00F0) >> 4); buf[pos++] = GETHEXCHAR(m & 0x000F); } i += static_cast(int32_t(n & 0xFFFF) - 1); } i++; } buf[pos] = '\0'; std::string newName = std::string(buf, pos); FreeCodecBuf(buf); return newName; } static inline bool UpdatePrimType(bool primType, int splitNo, uint32_t ch) { if (ch == 'L') { return false; } if (((ch == ';') || (ch == '(') || (ch == ')')) && (splitNo > 1)) { return true; } return primType; } namespace { constexpr int kNumLimit = 10; constexpr int kCodeOffset3 = 12; constexpr int kCodeOffset2 = 8; constexpr int kCodeOffset = 4; constexpr size_t k64BitShift = 6; // 64 is 1 << 6 } std::string DecodeName(const std::string &name) { if (name.find(';') != std::string::npos) { // no need Decoding a non-encoded string return name; } std::string decompressedName; const char *namePtr = nullptr; size_t nameLen; namePtr = name.c_str(); nameLen = name.length(); // Demangled name is supposed to be shorter. No buffer overflow issue here. std::string newName(nameLen, '\0'); bool primType = true; int splitNo = 0; // split: class 0 | method 1 | signature 2 size_t pos = 0; std::string str; std::u16string str16; for (size_t i = 0; i < nameLen;) { unsigned char c = static_cast(namePtr[i]); ++i; if (c == '_') { // _XX: '_' followed by ascii code in hex if (i >= nameLen) { break; } if (namePtr[i] == '_') { newName[pos++] = namePtr[i++]; } else if (namePtr[i] == 'u') { str.clear(); str16.clear(); i++; c = static_cast(namePtr[i++]); uint8_t b1 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit; c = static_cast(namePtr[i++]); uint8_t b2 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit; c = static_cast(namePtr[i++]); uint8_t b3 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit; c = static_cast(namePtr[i++]); uint8_t b4 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit; uint32_t codepoint = (b1 << kCodeOffset3) | (b2 << kCodeOffset2) | (b3 << kCodeOffset) | b4; str16 += static_cast(codepoint); unsigned int count = UTF16ToUTF8(str, str16, 1, false) >> 16; // shift 16 to get count if (count == 2) { // the count of str equal 2 to 4, use array to save the utf8 newName[pos++] = str[0]; newName[pos++] = str[1]; } else if (count == 3) { // the count of str equal 2 to 4, deal 3 new newName[pos++] = str[0]; newName[pos++] = str[1]; newName[pos++] = str[2]; // 2 is index of third char } else if (count == 4) { // the count of str equal 2 to 4 newName[pos++] = str[0]; newName[pos++] = str[1]; newName[pos++] = str[2]; // 2 is index of third char newName[pos++] = str[3]; // 3 is index of fourth char } } else { c = static_cast(namePtr[i++]); unsigned int v = (c <= '9') ? c - '0' : c - 'A' + kNumLimit; unsigned int asc = v << kCodeOffset; if (i >= nameLen) { break; } c = static_cast(namePtr[i++]); v = (c <= '9') ? c - '0' : c - 'A' + kNumLimit; asc += v; newName[pos++] = static_cast(asc); if (asc == '|') { splitNo++; } primType = UpdatePrimType(primType, splitNo, asc); } } else { if (splitNo < 2) { // split: class 0 | method 1 | signature 2 newName[pos++] = static_cast(c); continue; } primType = UpdatePrimType(primType, splitNo, c); if (primType) { newName[pos++] = (c == 'A') ? '[' : c; } else { newName[pos++] = static_cast(c); } } } newName.resize(pos); return newName; } // input: maple name // output: Lj/lang/Object; [Lj/lang/Object; void DecodeMapleNameToJDescriptor(const std::string &nameIn, std::string &nameOut) { nameOut = DecodeName(nameIn); if (nameOut[0] == 'A') { size_t i = 0; while (nameOut[i] == 'A') { nameOut[i++] = '['; } } } static uint16_t ChangeEndian16(uint16_t u16) { return ((u16 & 0xFF00) >> kCodeOffset2) | ((u16 & 0xFF) << kCodeOffset2); } /* UTF8 * U+0000 - U+007F 0xxxxxxx * U+0080 - U+07FF 110xxxxx 10xxxxxx * U+0800 - U+FFFF 1110xxxx 10xxxxxx 10xxxxxx * U+10000- U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * * UTF16 * U+0000 - U+D7FF codePoint * U+E000 - U+FFFF codePoint * U+10000- U+10FFFF XXXX YYYY * code = codePoint - 0x010000, ie, 20-bit number in the range 0x000000..0x0FFFFF * XXXX: top 10 bits of code + 0xD800: 0xD800..0xDBFF * YYYY: low 10 bits of code + 0xDC00: 0xDC00..0xDFFF * * convert upto num UTF8 elements * return two 16-bit values: return_number_of_elements | consumed_input_number_of_elements */ const int kCodepointOffset1 = 6; // U+0080 - U+07FF 110xxxxx 10xxxxxx const int kCodepointOffset2 = 12; // U+0800 - U+FFFF 1110xxxx 10xxxxxx 10xxxxxx const int kCodepointOffset3 = 18; // U+10000- U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx const int kCountOffset = 16; const int kCodeAfterMinusOffset = 10; // codePoint equals itself minus 0x10000 unsigned UTF16ToUTF8(std::string &str, const std::u16string &str16, unsigned short num, bool isBigEndian) { uint32_t codePoint = 0; uint32_t i = 0; unsigned short count = 0; unsigned short retNum = 0; while (i < str16.length()) { if (isBigEndian || num == 1) { codePoint = str16[i++]; } else { codePoint = ChangeEndian16(str16[i++]); } if (codePoint > 0xFFFF) { codePoint &= 0x3FF; codePoint <<= kNumLimit; if (isBigEndian) { codePoint += str16[i++] & 0x3FF; } else { codePoint += ChangeEndian16(str16[i++]) & 0x3FF; } } if (codePoint <= 0x7F) { str += static_cast(codePoint); retNum += 1; // 1 UTF8 char } else if (codePoint <= 0x7FF) { str += static_cast(0xC0 + (codePoint >> kCodepointOffset1)); str += static_cast(0x80 + (codePoint & 0x3F)); retNum += 2; // 2 UTF8 chars } else if (codePoint <= 0xFFFF) { str += static_cast(0xE0 + ((codePoint >> kCodepointOffset2) & 0xF)); str += static_cast(0x80 + ((codePoint >> kCodepointOffset1) & 0x3F)); str += static_cast(0x80 + (codePoint & 0x3F)); retNum += 3; // 3 UTF8 chars } else { str += static_cast(0xF0 + ((codePoint >> kCodepointOffset3) & 0x7)); str += static_cast(0x80 + ((codePoint >> kCodepointOffset2) & 0x3F)); str += static_cast(0x80 + ((codePoint >> kCodepointOffset1) & 0x3F)); str += static_cast(0x80 + (codePoint & 0x3F)); retNum += 4; // 4 UTF8 chars } count++; if (num == count) { return ((static_cast(retNum)) << kCountOffset) | static_cast(i); } } return i; } bool NeedConvertUTF16(const std::string &str8) { uint32_t a = 0; size_t i = 0; size_t size = str8.length(); while (i < size) { a = static_cast(str8[i++]); constexpr uint8_t maxValidAscii = 0x7F; if (a > maxValidAscii) { return true; } } return false; } uint32_t GetCodePoint(const std::string &str8, uint32_t &i) { uint32_t b; uint32_t c; uint32_t d; uint32_t codePoint = 0; uint32_t a = static_cast(str8[i++]); if (a <= 0x7F) { // 0... codePoint = a; } else if (a >= 0xF0) { // 11110... b = static_cast(str8[i++]); c = static_cast(str8[i++]); d = static_cast(str8[i++]); codePoint = ((a & 0x7) << kCodepointOffset3) | ((b & 0x3F) << kCodepointOffset2) | ((c & 0x3F) << kCodepointOffset1) | (d & 0x3F); } else if (a >= 0xE0) { // 1110... b = static_cast(str8[i++]); c = static_cast(str8[i++]); codePoint = ((a & 0xF) << kCodepointOffset2) | ((b & 0x3F) << kCodepointOffset1) | (c & 0x3F); } else if (a >= 0xC0) { // 110... b = static_cast(str8[i++]); codePoint = ((a & 0x1F) << kCodepointOffset1) | (b & 0x3F); } else { DEBUG_ASSERT(false && "invalid UTF-8"); } return codePoint; } // convert upto num UTF16 elements // two 16-bit values: return_number_of_elements | consumed_input_number_of_elements unsigned UTF8ToUTF16(std::u16string &str16, const std::string &str8, unsigned short num, bool isBigEndian) { uint32_t i = 0; unsigned short count = 0; unsigned short retNum = 0; while (i < str8.length()) { uint32_t codePoint = GetCodePoint(str8, i); if (codePoint <= 0xFFFF) { if (isBigEndian || num == 1) { str16 += static_cast(codePoint); } else { str16 += static_cast(ChangeEndian16(static_cast(codePoint))); } retNum += 1; // 1 utf16 } else { codePoint -= 0x10000; if (isBigEndian || num == 1) { str16 += static_cast((codePoint >> kCodeAfterMinusOffset) | 0xD800); str16 += static_cast((codePoint & 0x3FF) | 0xDC00); } else { str16 += static_cast( ChangeEndian16(static_cast((codePoint >> kCodeAfterMinusOffset) | 0xD800))); str16 += static_cast(ChangeEndian16((codePoint & 0x3FF) | 0xDC00)); } retNum += 2; // 2 utf16 } count++; // only convert num elmements if (num == count) { return (static_cast(retNum) << kCountOffset) | static_cast(i); } } return i; } const uint32_t kGreybackOffset = 7; void GetUnsignedLeb128Encode(std::vector &dest, uint32_t value) { bool done = false; do { uint8_t byte = value & 0x7f; value >>= kGreybackOffset; done = (value == 0); if (!done) { byte |= 0x80; } dest.push_back(byte); } while (!done); } uint32_t GetUnsignedLeb128Decode(const uint8_t **data) { DEBUG_ASSERT(data != nullptr && "data in GetUnsignedLeb128Decode() is nullptr"); const uint8_t *ptr = *data; uint32_t result = 0; uint32_t shift = 0; uint8_t byte = 0; while (true) { byte = *(ptr++); result |= (byte & 0x7f) << shift; if ((byte & 0x80) == 0) { break; } shift += kGreybackOffset; } *data = ptr; return result; } uint64_t GetLEB128Encode(int64_t val, bool isUnsigned) { uint64_t res = 0; uint8_t byte = 0; uint8_t count = 0; bool done = false; do { byte = static_cast(val) & 0x7f; val >>= kGreybackOffset; // intended signed shift: block codedex here done = (isUnsigned ? val == 0 : (val == 0 || val == -1)); if (!done) { byte |= 0x80; } res |= (static_cast(byte) << (count++ << 3)); // each byte need 8 bit (left shift 3) } while (!done); return res; } uint64_t GetUleb128Encode(uint64_t val) { return GetLEB128Encode(int64_t(val), true); } uint64_t GetSleb128Encode(int64_t val) { return GetLEB128Encode(val, false); } uint64_t GetUleb128Decode(uint64_t val) { return val; } int64_t GetSleb128Decode(uint64_t val) { return static_cast(val); } size_t GetUleb128Size(uint64_t v) { DEBUG_ASSERT(v && "if v == 0, __builtin_clzll(v) is not defined"); size_t clz = static_cast(__builtin_clzll(v)); // num of 7-bit groups, (64 - clz + 6) / 7 return size_t((64 - clz + 6) / 7); } size_t GetSleb128Size(int32_t v) { size_t size = 0; int rem = v >> kGreybackOffset; bool hasMore = true; int end = ((v >= 0) ? 0 : -1); while (hasMore) { hasMore = (rem != end) || ((rem & 1) != ((v >> k64BitShift) & 1)); // judege whether has More valid rem size++; v = rem; rem >>= static_cast(kGreybackOffset); // intended signed shift: block codedex here } return size; } } // namespace namemangler