/*
 * Copyright (c) 2021 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/utf.h"

#include <cstdint>
#include <vector>

#include <gtest/gtest.h>

namespace panda::utf::test {

// Computes the UTF-16 lead (high) surrogate for a supplementary code point.
// 0xd7c0 equals 0xd800 - (0x10000 >> 10), so the 0x10000 bias is removed and
// the surrogate base is added in a single step.
// Not referenced by the tests in this file, hence [[maybe_unused]].
[[maybe_unused]] static uint16_t U16_lead(uint32_t codepoint)
{
    return ((codepoint >> 10U) + 0xd7c0) & 0xffff;
}

// Computes the UTF-16 tail (low) surrogate: the low 10 bits of the code point
// placed on top of the 0xdc00 base.
// Not referenced by the tests in this file, hence [[maybe_unused]].
[[maybe_unused]] static uint16_t U16_tail(uint32_t codepoint)
{
    return (codepoint & 0x3ff) | 0xdc00;
}

// Decodes null-terminated MUTF-8 byte sequences with ConvertMUtf8ToUtf16 and
// compares the produced UTF-16 code units with the expected ones. The input
// length passed to the converter is whatever utf::Mutf8Size reports.
TEST(Utf, ConvertMUtf8ToUtf16)
{
    // 2-byte mutf-8 U+0000
    {
        const std::vector<uint8_t> in {0xc0, 0x80, 0x00};
        const std::vector<uint16_t> res {0x0};
        std::vector<uint16_t> out(res.size());
        ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
        EXPECT_EQ(out, res);
    }

    // 1-byte mutf-8: 0xxxxxxx
    {
        const std::vector<uint8_t> in {0x7f, 0x00};
        const std::vector<uint16_t> res {0x7f};
        std::vector<uint16_t> out(res.size());
        ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
        EXPECT_EQ(out, res);
    }

    // 2-byte mutf-8: 110xxxxx 10xxxxxx
    {
        const std::vector<uint8_t> in {0xc2, 0xa7, 0x33, 0x00};
        const std::vector<uint16_t> res {0xa7, 0x33};
        std::vector<uint16_t> out(res.size());
        ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
        EXPECT_EQ(out, res);
    }

    // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
    {
        const std::vector<uint8_t> in {0xef, 0xbf, 0x83, 0x33, 0x00};
        const std::vector<uint16_t> res {0xffc3, 0x33};
        std::vector<uint16_t> out(res.size());
        ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
        EXPECT_EQ(out, res);
    }

    // double 3-byte mutf-8: 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx
    {
        const std::vector<uint8_t> in {0xed, 0xa0, 0x81, 0xed, 0xb0, 0xb7, 0x00};
        const std::vector<uint16_t> res {0xd801, 0xdc37};
        std::vector<uint16_t> out(res.size());
        ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
        EXPECT_EQ(out, res);
    }

    // single 3-byte encoded high surrogate (0xd8d2) with no matching low
    // surrogate, surrounded by 1-byte characters
    {
        const std::vector<uint8_t> in {0x5b, 0x61, 0x62, 0x63, 0xed, 0xa3, 0x92, 0x5d, 0x00};
        const std::vector<uint16_t> res {0x5b, 0x61, 0x62, 0x63, 0xd8d2, 0x5d};
        std::vector<uint16_t> out(res.size());
        ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
        EXPECT_EQ(out, res);
    }

    // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, expected to decode
    // into a surrogate pair
    {
        const std::vector<uint8_t> in {0xF0, 0x9F, 0x91, 0xB3, 0x00};
        const std::vector<uint16_t> res {0xD83D, 0xDC73};
        std::vector<uint16_t> out(res.size());
        ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
        EXPECT_EQ(out, res);
    }
}

// Checks the byte count reported by Utf16ToMUtf8Size. Every expected value
// below is one larger than the number of payload bytes of the encoding, i.e.
// the expectations account for a terminating null byte.
TEST(Utf, Utf16ToMUtf8Size)
{
    // 2-byte mutf-8 U+0000
    {
        const std::vector<uint16_t> in {0x0};
        size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 3U);
    }

    // 1-byte mutf-8: 0xxxxxxx
    {
        const std::vector<uint16_t> in {0x7f};
        size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 2U);
    }

    // 2-byte mutf-8: 110xxxxx 10xxxxxx
    {
        const std::vector<uint16_t> in {0xa7, 0x33};
        size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 4U);
    }

    // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
    {
        const std::vector<uint16_t> in {0xffc3, 0x33};
        size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 5U);
    }

    // surrogate pair: sized as one 4-byte sequence
    // (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx), not as two 3-byte sequences
    {
        const std::vector<uint16_t> in {0xd801, 0xdc37};
        size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 5U);
    }
}

// Encodes UTF-16 code units with ConvertRegionUtf16ToMUtf8 and checks both the
// returned byte count and the produced bytes. The output buffer holds one
// extra byte; out.size() - 1 is passed to the converter and the last byte is
// set to '\0' by the test before the comparison.
TEST(Utf, ConvertRegionUtf16ToMUtf8)
{
    // 2-byte mutf-8 U+0000
    {
        const std::vector<uint16_t> in {0x0};
        const std::vector<uint8_t> res {0xc0, 0x80, 0x00};
        std::vector<uint8_t> out(res.size());
        size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 2U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // 1-byte mutf-8: 0xxxxxxx
    {
        const std::vector<uint16_t> in {0x7f};
        const std::vector<uint8_t> res {0x7f, 0x00};
        std::vector<uint8_t> out(res.size());
        size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 1U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // 2-byte mutf-8: 110xxxxx 10xxxxxx
    {
        const std::vector<uint16_t> in {0xa7, 0x33};
        const std::vector<uint8_t> res {0xc2, 0xa7, 0x33, 0x00};
        std::vector<uint8_t> out(res.size());
        size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 3U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
    {
        const std::vector<uint16_t> in {0xffc3, 0x33};
        const std::vector<uint8_t> res {0xef, 0xbf, 0x83, 0x33, 0x00};
        std::vector<uint8_t> out(res.size());
        size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 4U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
    // utf-16 data in 0xd800-0xdfff (unpaired high surrogate)
    {
        const std::vector<uint16_t> in {0xd834, 0x33};
        const std::vector<uint8_t> res {0xed, 0xa0, 0xb4, 0x33, 0x00};
        std::vector<uint8_t> out(res.size());
        size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 4U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
    // utf-16 data in 0xd800-0xdfff (unpaired low surrogate)
    {
        const std::vector<uint16_t> in {0xdf06, 0x33};
        const std::vector<uint8_t> res {0xed, 0xbc, 0x86, 0x33, 0x00};
        std::vector<uint8_t> out(res.size());
        size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 4U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // surrogate pair: expected as one 4-byte sequence
    // (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx), not as two 3-byte sequences
    {
        const std::vector<uint16_t> in {0xd801, 0xdc37};
        const std::vector<uint8_t> res {0xf0, 0x90, 0x90, 0xb7, 0x00};
        std::vector<uint8_t> out(res.size());
        size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 4U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }
}

// Checks the sign of CompareMUtf8ToMUtf8 for null-terminated inputs whose
// leading bytes match the 1-, 2-, 3- and 4-byte patterns. Only the sign of the
// result is verified, never its magnitude.
TEST(Utf, CompareMUtf8ToMUtf8)
{
    // 1-byte utf-8: 0xxxxxxx
    {
        const std::vector<uint8_t> v1 {0x00};
        const std::vector<uint8_t> v2 {0x7f, 0x00};
        EXPECT_LT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x02, 0x00};
        const std::vector<uint8_t> v2 {0x00};
        EXPECT_GT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x7f, 0x00};
        const std::vector<uint8_t> v2 {0x7f, 0x00};
        EXPECT_EQ(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x01, 0x7f, 0x00};
        const std::vector<uint8_t> v2 {0x01, 0x70, 0x00};
        EXPECT_GT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x01, 0x71, 0x00};
        const std::vector<uint8_t> v2 {0x01, 0x73, 0x00};
        EXPECT_LT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    // 2-byte utf-8: 110xxxxx 10xxxxxx
    {
        const std::vector<uint8_t> v1 {0xdf, 0xbf, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xdf, 0xbf, 0x03, 0x00};
        EXPECT_EQ(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xdf, 0xb1, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xd1, 0xb2, 0x03, 0x00};
        EXPECT_GT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xd1, 0xbf, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xdf, 0xb0, 0x03, 0x00};
        EXPECT_LT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    // 3-byte utf-8: 1110xxxx 10xxxxxx 10xxxxxx
    {
        const std::vector<uint8_t> v1 {0xef, 0xbf, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xef, 0xbf, 0x03, 0x04, 0x00};
        EXPECT_EQ(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xef, 0xb2, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xe0, 0xbf, 0x03, 0x04, 0x00};
        EXPECT_GT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xef, 0xb0, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xef, 0xbf, 0x05, 0x04, 0x00};
        EXPECT_LT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    // 4-byte utf-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_EQ(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x0a, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_GT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf8, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_LT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }
}

// Same data set as CompareMUtf8ToMUtf8, but exercising the length-taking
// CompareUtf8ToUtf8. The sizes passed are the full vector sizes, so the
// trailing 0x00 byte of every input is part of the compared range.
TEST(Utf, CompareUtf8ToUtf8)
{
    // 1-byte utf-8: 0xxxxxxx
    {
        const std::vector<uint8_t> v1 {0x00};
        const std::vector<uint8_t> v2 {0x7f, 0x00};
        EXPECT_LT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x02, 0x00};
        const std::vector<uint8_t> v2 {0x00};
        EXPECT_GT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x7f, 0x00};
        const std::vector<uint8_t> v2 {0x7f, 0x00};
        EXPECT_EQ(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x01, 0x7f, 0x00};
        const std::vector<uint8_t> v2 {0x01, 0x70, 0x00};
        EXPECT_GT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x01, 0x71, 0x00};
        const std::vector<uint8_t> v2 {0x01, 0x73, 0x00};
        EXPECT_LT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    // 2-byte utf-8: 110xxxxx 10xxxxxx
    {
        const std::vector<uint8_t> v1 {0xdf, 0xbf, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xdf, 0xbf, 0x03, 0x00};
        EXPECT_EQ(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xdf, 0xb1, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xd1, 0xb2, 0x03, 0x00};
        EXPECT_GT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xd1, 0xbf, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xdf, 0xb0, 0x03, 0x00};
        EXPECT_LT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    // 3-byte utf-8: 1110xxxx 10xxxxxx 10xxxxxx
    {
        const std::vector<uint8_t> v1 {0xef, 0xbf, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xef, 0xbf, 0x03, 0x04, 0x00};
        EXPECT_EQ(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xef, 0xb2, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xe0, 0xbf, 0x03, 0x04, 0x00};
        EXPECT_GT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xef, 0xb0, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xef, 0xbf, 0x05, 0x04, 0x00};
        EXPECT_LT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    // 4-byte utf-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_EQ(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x0a, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_GT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf8, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_LT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }
}

}  // namespace panda::utf::test