ecmascript/base/utf_helper.cpp

/*
 * Copyright (c) 2021 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ecmascript/base/utf_helper.h"

#include "ecmascript/log_wrapper.h"

// Constant term of the surrogate-pair formula used by U16_GET_SUPPLEMENTARY: it folds the shifted lead base
// (0xd800 << 10), the trail base (0xdc00) and the 0x10000 bias into a single value so that the macro needs
// only one subtraction.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000;
// Combines two UTF-16 code units as (lead << 10) + trail - U16_SURROGATE_OFFSET, which is algebraically the same
// as ((lead - 0xd800) << 10) + (trail - 0xdc00) + 0x10000.
// Both arguments are cast to int32_t before the arithmetic; the macro performs no range checking.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET)

namespace panda::ecmascript::base::utf_helper {
// Combines a UTF-16 surrogate pair into a single value.
//   lead:  first unit, asserted to lie in [DECODE_LEAD_LOW, DECODE_LEAD_HIGH].
//   trail: second unit, asserted to lie in [DECODE_TRAIL_LOW, DECODE_TRAIL_HIGH].
// Returns (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR.
uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
{
    ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
           (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
    // Contribution of the lead unit, then of the trail unit, then the fixed bias.
    return (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR +
           (trail - DECODE_TRAIL_LOW) +
           DECODE_SECOND_FACTOR;
}

// Returns true when ch lies in the inclusive range [DECODE_LEAD_LOW, DECODE_LEAD_HIGH].
bool IsUTF16HighSurrogate(uint16_t ch)
{
    if (ch < DECODE_LEAD_LOW) {
        return false;
    }
    return ch <= DECODE_LEAD_HIGH;
}

// Returns true when ch lies in the inclusive range [DECODE_TRAIL_LOW, DECODE_TRAIL_HIGH].
bool IsUTF16LowSurrogate(uint16_t ch)
{
    if (ch < DECODE_TRAIL_LOW) {
        return false;
    }
    return ch <= DECODE_TRAIL_HIGH;
}

// Decodes the UTF-16 unit at utf16[*index] into a code point.
//   utf16: buffer of code units; must hold at least len units.
//   len:   exclusive upper bound used to decide whether a following unit exists.
//   index: in/out position; advanced by one only when a full surrogate pair is consumed, so the caller's own
//          increment then steps past the pair.
// A high surrogate followed by a low surrogate yields the combined code point. Every other unit, including an
// unpaired surrogate, is returned unchanged.
uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index)
{
    const size_t pos = *index;
    const uint16_t lead = utf16[pos];
    // A pair can only start at a high surrogate that is not the last unit of the buffer.
    const bool mayStartPair =
        (lead & SURROGATE_MASK) == DECODE_LEAD_LOW && IsUTF16HighSurrogate(lead) && pos != len - 1;
    if (mayStartPair) {
        const uint16_t trail = utf16[pos + 1];
        if (IsUTF16LowSurrogate(trail)) {
            *index = pos + 1;
            return ((lead - DECODE_LEAD_LOW) << UTF16_OFFSET) + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
        }
    }
    return lead;
}

// Decodes the UTF-16 unit at utf16[*index], substituting UTF16_REPLACEMENT_CHARACTER for ill-formed surrogates.
//   utf16: buffer of code units; must hold at least len units.
//   len:   exclusive upper bound used to decide whether a following unit exists.
//   index: in/out position; advanced by one only when a full surrogate pair is consumed.
// Returns the combined code point for a valid pair, the unit itself for a non-surrogate, and
// UTF16_REPLACEMENT_CHARACTER for a leading low surrogate or a high surrogate without a low surrogate after it.
uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index)
{
    const size_t pos = *index;
    const uint16_t unit = utf16[pos];
    // A low surrogate can never open a pair.
    if (IsUTF16LowSurrogate(unit)) {
        return UTF16_REPLACEMENT_CHARACTER;
    }
    const bool opensPair = IsUTF16HighSurrogate(unit) || (unit & SURROGATE_MASK) == DECODE_LEAD_LOW;
    if (!opensPair) {
        // Not in the surrogate range at all: the unit is its own code point.
        return unit;
    }
    if (pos == len - 1) {
        // High surrogate at the very end of the input.
        return UTF16_REPLACEMENT_CHARACTER;
    }
    const uint16_t next = utf16[pos + 1];
    if (!IsUTF16LowSurrogate(next)) {
        // High surrogate followed by something other than a low surrogate.
        return UTF16_REPLACEMENT_CHARACTER;
    }
    // Well-formed pair: consume the trailing unit and decode.
    *index = pos + 1;
    return ((unit - DECODE_LEAD_LOW) << UTF16_OFFSET) + (next - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
}

// Returns the number of UTF-8 bytes used for codepoint: one, two or three when it does not exceed UTF8_1B_MAX,
// UTF8_2B_MAX or UTF8_3B_MAX respectively (tested in that order), otherwise four.
inline size_t UTF8Length(uint32_t codepoint)
{
    size_t byteCount = UtfLength::FOUR;
    if (codepoint <= UTF8_1B_MAX) {
        byteCount = UtfLength::ONE;
    } else if (codepoint <= UTF8_2B_MAX) {
        byteCount = UtfLength::TWO;
    } else if (codepoint <= UTF8_3B_MAX) {
        byteCount = UtfLength::THREE;
    }
    return byteCount;
}

// Methods for encode unicode to utf8
// Writes the UTF-8 form of codepoint into utf8 starting at position index.
//   utf8:  destination buffer.
//   len:   capacity of utf8 in bytes.
//   index: position of the first byte to write.
// Returns the number of bytes written, or 0 (writing nothing) when the sequence would not fit within len.
size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index)
{
    const size_t byteCount = UTF8Length(codepoint);
    if (index + byteCount > len) {
        return 0;
    }
    uint8_t *leadByte = utf8 + index;
    // Emit the trailing bytes from last to first, consuming UTF8_OFFSET bits of the code point each time.
    for (uint8_t *cursor = leadByte + (byteCount - 1); cursor != leadByte; --cursor) {
        *cursor = static_cast<uint8_t>((codepoint | byteMark) & byteMask);
        codepoint >>= UTF8_OFFSET;
    }
    // Whatever bits remain go into the lead byte together with the length marker for this byte count.
    *leadByte = static_cast<uint8_t>(codepoint | firstByteMark[byteCount]);
    return byteCount;
}

// Checks whether data, taken as a whole, is one acceptable UTF-8 sequence of one to four bytes.
// The first byte must carry the lead pattern matching data.size(); two-byte leads below UTF8_2B_FIRST_MIN are
// rejected, as are the combinations (UTF8_3B_FIRST, second byte < UTF8_3B_SECOND_MIN) and
// (UTF8_4B_FIRST, second byte < UTF8_4B_SECOND_MIN). Every byte after the first must match the
// continuation pattern. Any other size logs a fatal error and reaches UNREACHABLE().
bool IsValidUTF8(const std::vector<uint8_t> &data)
{
    const uint32_t byteCount = data.size();
    bool leadAccepted = true;
    switch (byteCount) {
        case UtfLength::ONE:
            leadAccepted = data.at(0) < BIT_MASK_1;
            break;
        case UtfLength::TWO:
            leadAccepted = (data.at(0) & BIT_MASK_3) == BIT_MASK_2 && data.at(0) >= UTF8_2B_FIRST_MIN;
            break;
        case UtfLength::THREE:
            leadAccepted = (data.at(0) & BIT_MASK_4) == BIT_MASK_3 &&
                           !(data.at(0) == UTF8_3B_FIRST && data.at(1) < UTF8_3B_SECOND_MIN);
            break;
        case UtfLength::FOUR:
            leadAccepted = (data.at(0) & BIT_MASK_5) == BIT_MASK_4 &&
                           !(data.at(0) == UTF8_4B_FIRST && data.at(1) < UTF8_4B_SECOND_MIN);
            break;
        default:
            LOG_ECMA(FATAL) << "this branch is unreachable";
            UNREACHABLE();
            break;
    }
    if (!leadAccepted) {
        return false;
    }

    // All remaining bytes have to look like continuation bytes.
    for (uint32_t pos = 1; pos < byteCount; ++pos) {
        const bool isContinuation = (data[pos] & BIT_MASK_2) == BIT_MASK_1;
        if (!isContinuation) {
            return false;
        }
    }
    return true;
}

// Encodes the UTF-16 unit d0, together with d1 when the two form a surrogate pair, as UTF-8.
//   d0: current code unit.
//   d1: following code unit; 0 combined with a surrogate d0 means d0 stands alone.
//   modify, isWriteBuffer: choose how d0 == 0 is emitted.
// Returns the byte count and the bytes. Cases, in the order they are tested:
//   - d1 == 0 and d0 anywhere in the surrogate range: three bytes encoding d0 itself;
//   - d0 == 0: one 0x00 byte if isWriteBuffer, else UTF8_2B_FIRST UTF8_2B_SECOND if modify, else zero bytes;
//   - d0 <= UTF8_1B_MAX: one byte; d0 <= UTF8_2B_MAX: two bytes;
//   - d0 outside the high-surrogate range: three bytes;
//   - d0 a high surrogate and d1 a low surrogate: four bytes for the combined code point.
// A high surrogate whose d1 is non-zero and not a low surrogate logs a fatal error and reaches UNREACHABLE().
Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer)
{
    // Three-byte form, shared by lone surrogates and by ordinary units above the two-byte limit.
    auto encodeThreeBytes = [](uint16_t unit) -> Utf8Char {
        auto b0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(unit >> UtfOffset::TWELVE));
        auto b1 =
            static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(unit >> UtfOffset::SIX) & utf::MASK_6BIT));
        auto b2 = static_cast<uint8_t>(UTF8_3B_THIRD | (unit & utf::MASK_6BIT));
        return {UtfLength::THREE, {b0, b1, b2}};
    };

    // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
    // means that is a single code point, it needs to be represented by three UTF8 code.
    const bool firstIsSurrogate = d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX;
    if (d1 == 0 && firstIsSurrogate) {
        return encodeThreeBytes(d0);
    }

    if (d0 == 0) {
        if (isWriteBuffer) {
            return {1, {0x00U}};
        }
        if (!modify) {
            // For print string, just skip '\u0000'
            return {0, {0x00U}};
        }
        // special case for \u0000 ==> C080 - 1100'0000 1000'0000
        return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
    }
    if (d0 <= UTF8_1B_MAX) {
        return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
    }
    if (d0 <= UTF8_2B_MAX) {
        auto b0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
        auto b1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT));
        return {UtfLength::TWO, {b0, b1}};
    }
    const bool firstIsHighSurrogate = d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::HI_SURROGATE_MAX;
    if (!firstIsHighSurrogate) {
        return encodeThreeBytes(d0);
    }
    if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) {
        // Bad sequence
        LOG_ECMA(FATAL) << "this branch is unreachable";
        UNREACHABLE();
    }

    // Proper surrogate pair: merge both units and emit the four-byte form.
    uint32_t merged = CombineTwoU16(d0, d1);

    auto b0 = static_cast<uint8_t>((merged >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
    auto b1 = static_cast<uint8_t>(((merged >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1);
    auto b2 = static_cast<uint8_t>(((merged >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1);
    auto b3 = static_cast<uint8_t>((merged & utf::MASK_6BIT) | utf::MASK1);
    return {UtfLength::FOUR, {b0, b1, b2, b3}};
}

// Computes how many bytes the UTF-8 form of utf16[0..length) needs, plus one for a terminating zero byte.
//   utf16:  source code units.
//   length: number of code units.
//   modify: when set, each zero unit counts as two bytes (U+0000 => C0 80); otherwise zero units count as nothing.
// A high surrogate directly followed by a low surrogate counts as four bytes and consumes both units; any
// other unit above UTF8_2B_MAX, including an unpaired surrogate, counts as three bytes.
size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)
{
    size_t total = 1;  // zero byte
    // when utf16 data length is only 1 and code in 0xd800-0xdfff,
    // means that is a single code point, it needs to be represented by three UTF8 code.
    if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        utf16[0] <= utf::LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        return total + UtfLength::THREE;
    }

    for (uint32_t pos = 0; pos < length; ++pos) {
        const uint16_t unit = utf16[pos];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (unit == 0) {
            if (modify) {
                total += UtfLength::TWO;  // special case for U+0000 => C0 80
            }
            continue;
        }
        if (unit <= UTF8_1B_MAX) {
            total += 1;
            continue;
        }
        if (unit <= UTF8_2B_MAX) {
            total += UtfLength::TWO;
            continue;
        }
        if (unit < utf::HI_SURROGATE_MIN || unit > utf::HI_SURROGATE_MAX) {
            total += UtfLength::THREE;
            continue;
        }
        // unit is a high surrogate: it forms a pair only if a low surrogate follows inside the buffer.
        const bool hasPartner =
            pos < length - 1 &&
            utf16[pos + 1] >= utf::LO_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
            utf16[pos + 1] <= utf::LO_SURROGATE_MAX;    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (hasPartner) {
            total += UtfLength::FOUR;
            ++pos;
        } else {
            total += UtfLength::THREE;
        }
    }
    return total;
}

size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
                                size_t start, bool modify, bool isWriteBuffer)
{
    if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
        return 0;
    }
    size_t utf8Pos = 0;
    size_t end = start + utf16Len;
    for (size_t i = start; i < end; ++i) {
        uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
        if (codepoint == 0) {
            if (isWriteBuffer) {
                utf8Out[utf8Pos++] = 0x00U;
                continue;
            }
            if (modify) {
                // special case for \u0000 ==> C080 - 1100'0000 1000'0000
                utf8Out[utf8Pos++] = UTF8_2B_FIRST;
                utf8Out[utf8Pos++] = UTF8_2B_SECOND;
            }
            continue;
        }
        utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
    }
    return utf8Pos;
}

size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
                                        size_t start, bool modify, bool isWriteBuffer)
{
    if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
        return 0;
    }
    size_t utf8Pos = 0;
    size_t end = start + utf16Len;
    for (size_t i = start; i < end; ++i) {
        uint32_t codepoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
        if (codepoint == 0) {
            if (isWriteBuffer) {
                utf8Out[utf8Pos++] = 0x00U;
                continue;
            }
            if (modify) {
                // special case for \u0000 ==> C080 - 1100'0000 1000'0000
                utf8Out[utf8Pos++] = UTF8_2B_FIRST;
                utf8Out[utf8Pos++] = UTF8_2B_SECOND;
            }
            continue;
        }
        utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
    }
    return utf8Pos;
}

// Decodes one UTF-8 sequence starting at data[0].
//   data:    input bytes. No length is passed and no validation is done: data[1], data[2] and data[3] are read
//            as soon as the preceding tests on the first byte (utf::MASK1, utf::MASK2, utf::MASK3) do not
//            select a shorter form, so the caller must guarantee those bytes are readable.
//   combine: affects four-byte sequences only. When set, the result is the value produced by
//            U16_GET_SUPPLEMENTARY(lead, tail); otherwise the lead unit is placed above the tail unit,
//            shifted left by utf::PAIR_ELEMENT_WIDTH.
// Returns {decoded value, number of input bytes consumed}.
std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
{
    const uint8_t b0 = data[0];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    if ((b0 & utf::MASK1) == 0) {
        return {b0, 1};
    }

    const uint8_t b1 = data[1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    if ((b0 & utf::MASK2) == 0) {
        return {((b0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (b1 & utf::MASK_6BIT), UtfLength::TWO};
    }

    const uint8_t b2 = data[UtfLength::TWO];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    if ((b0 & utf::MASK3) == 0) {
        return {((b0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((b1 & utf::MASK_6BIT) << utf::DATA_WIDTH) |
                    (b2 & utf::MASK_6BIT),
                UtfLength::THREE};
    }

    const uint8_t b3 = data[UtfLength::THREE];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    uint32_t codePoint = ((b0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((b1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
                         ((b2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (b3 & utf::MASK_6BIT);

    // Split the code point into its two UTF-16 units; both result flavours are built from these.
    const uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD);
    const uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
    if (combine) {
        // NOLINTNEXTLINE(hicpp-signed-bitwise)
        return {static_cast<uint32_t>(U16_GET_SUPPLEMENTARY(lead, tail)), UtfLength::FOUR};
    }
    // Pack the pair into one word: lead unit in the upper part, tail unit in the lower part.
    return {(lead << utf::PAIR_ELEMENT_WIDTH) | tail, UtfLength::FOUR};
}

// Thin wrapper: forwards utf8 and utf8Len to utf::MUtf8ToUtf16Size and returns its result unchanged.
size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
{
    const size_t utf16Units = utf::MUtf8ToUtf16Size(utf8, utf8Len);
    return utf16Units;
}

// Thin wrapper: forwards all arguments, in the same order, to utf::ConvertRegionMUtf8ToUtf16 and returns
// its result unchanged.
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
                                size_t start)
{
    const size_t converted = utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
    return converted;
}

// Converts UTF-16 input to one byte per decoded code point.
//   utf16In, utf16Len:    source code units and their count.
//   latin1Out, latin1Len: destination buffer and its capacity in bytes.
// Each code point produced by DecodeUTF16 is masked with latin1Limit and stored as one byte, so a surrogate
// pair yields a single output byte. Conversion stops when the input is exhausted or the output is full.
// Returns the number of bytes written; 0 for a null buffer or zero capacity.
size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len)
{
    if (utf16In == nullptr || latin1Out == nullptr || latin1Len == 0) {
        return 0;
    }
    size_t written = 0;
    for (size_t pos = 0; pos < utf16Len && written != latin1Len; ++pos) {
        // DecodeUTF16 may advance pos by one extra unit when it consumes a surrogate pair.
        const uint32_t codepoint = DecodeUTF16(utf16In, utf16Len, &pos);
        latin1Out[written] = static_cast<uint8_t>(codepoint & latin1Limit);
        ++written;
    }
    return written;
}

std::pair<int32_t, size_t> ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen)
{
    if (maxLen == 0) {
        return {INVALID_UTF8, 0};
    }
    Span<const uint8_t> sp(utf8, maxLen);
    // one byte
    uint8_t d0 = sp[0];
    if ((d0 & BIT_MASK_1) == 0) {
        return {d0, UtfLength::ONE};
    }
    if (maxLen < UtfLength::TWO) {
        return {INVALID_UTF8, 0};
    }
    // two bytes
    uint8_t d1 = sp[UtfLength::ONE];
    if ((d0 & BIT_MASK_3) == BIT_MASK_2) {
        if ((d1 & BIT_MASK_2) == BIT_MASK_1) {
            return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
        } else {
            return {INVALID_UTF8, 0};
        }
    }
    if (maxLen < UtfLength::THREE) {
        return {INVALID_UTF8, 0};
    }
    // three bytes
    uint8_t d2 = sp[UtfLength::TWO];
    if ((d0 & BIT_MASK_4) == BIT_MASK_3) {
        if (((d1 & BIT_MASK_2) == BIT_MASK_1) && ((d2 & BIT_MASK_2) == BIT_MASK_1)) {
            return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) |
                ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d2 & utf::MASK_6BIT), UtfLength::THREE};
        } else {
            return {INVALID_UTF8, 0};
        }
    }
    if (maxLen < UtfLength::FOUR) {
        return {INVALID_UTF8, 0};
    }
    // four bytes
    uint8_t d3 = sp[UtfLength::THREE];
    if ((d0 & BIT_MASK_5) == BIT_MASK_4) {
        if (((d1 & BIT_MASK_2) == BIT_MASK_1) &&
            ((d2 & BIT_MASK_2) == BIT_MASK_1) && ((d3 & BIT_MASK_2) == BIT_MASK_1)) {
            return {((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
                ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT), UtfLength::FOUR};
        } else {
            return {INVALID_UTF8, 0};
        }
    }
    return {INVALID_UTF8, 0};
}
}  // namespace panda::ecmascript::base::utf_helper