1 /* 2 * Copyright (c) 2025 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #include <gtest/gtest.h> 17 18 #include <base/containers/string.h> 19 #include <base/containers/string_view.h> 20 #include <base/containers/vector.h> 21 #include <base/util/utf8_decode.h> 22 23 using namespace testing::ext; 24 25 namespace { 26 // clang-format off 27 constexpr const char STR_LAT[] = 28 u8"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut " 29 u8"labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco "; 30 31 constexpr const char STR_MATH[] = u8"\u2B30 \u2B48"; 32 constexpr const uint32_t STR_MATH_CPS[] = { 0x2b30, ' ', 0x2b48 }; 33 34 constexpr const char STR_CH[] = u8"快速的棕色狐狸跳过懒狗"; 35 constexpr const uint32_t STR_CH_CPS[] = { 0x5FEB, 0x901F, 0x7684, 0x68D5, 0x8272, 0x72D0, 0x72F8, 0x8DF3, 0x8FC7, 36 0x61D2, 0x72D7 }; 37 38 constexpr const char STR_RUN[] = u8"ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ" 39 u8"ᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ" 40 u8"ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ"; 41 constexpr const uint32_t STR_RUN_CPS[] = { 0x16A0, 0x16C7, 0x16BB, 0x16EB, 0x16D2, 0x16E6, 0x16A6, 0x16EB, 0x16A0, 42 0x16B1, 0x16A9, 0x16A0, 0x16A2, 0x16B1, 0x16EB, 0x16A0, 0x16C1, 0x16B1, 0x16AA, 0x16EB, 0x16B7, 0x16D6, 0x16BB, 43 0x16B9, 0x16E6, 0x16DA, 0x16B3, 0x16A2, 0x16D7, 0x16CB, 0x16B3, 0x16D6, 0x16AA, 0x16DA, 0x16EB, 0x16A6, 0x16D6, 44 0x16AA, 0x16BB, 0x16EB, 0x16D7, 0x16AA, 0x16BE, 0x16BE, 0x16AA, 0x16EB, 0x16B7, 0x16D6, 0x16BB, 0x16B9, 0x16E6, 45 0x16DA, 0x16B3, 0x16EB, 0x16D7, 0x16C1, 0x16B3, 0x16DA, 0x16A2, 0x16BE, 0x16EB, 0x16BB, 0x16E6, 0x16CF, 0x16EB, 46 0x16DE, 0x16AB, 0x16DA, 0x16AA, 0x16BE, 0x16B7, 0x16C1, 0x16A0, 0x16EB, 0x16BB, 0x16D6, 0x16EB, 0x16B9, 0x16C1, 47 0x16DA, 0x16D6, 0x16EB, 0x16A0, 0x16A9, 0x16B1, 0x16EB, 0x16DE, 0x16B1, 0x16C1, 0x16BB, 0x16CF, 0x16BE, 0x16D6, 48 0x16EB, 0x16DE, 0x16A9, 0x16D7, 0x16D6, 0x16CB, 0x16EB, 0x16BB, 0x16DA, 0x16C7, 0x16CF, 0x16AA, 0x16BE }; 49 50 constexpr const char STR_FIN[] = u8"Törkylempijävongahdus"; 51 constexpr const uint32_t STR_FIN_CPS[] = { 0x0054, 0x00F6, 0x0072, 0x006B, 0x0079, 0x006C, 0x0065, 0x006D, 0x0070, 52 0x0069, 0x006A, 0x00E4, 0x0076, 0x006F, 0x006E, 0x0067, 0x0061, 0x0068, 0x0064, 0x0075, 0x0073 }; 53 54 constexpr const char STR_BOUNDARIES[] = u8"\u0080\u0800\u007F\u07FF\uFFFF\uD7FF\uE000\uFFFD"; 55 constexpr const uint32_t STR_BOUNDARIES_CPS[] = { 0x0080, 0x0800, 0x007F, 0x07FF, 0xFFFF, 0xD7FF, 0xE000, 0xFFFD }; 56 57 //https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 58 constexpr const char STR_INVALID[][7] = { 59 // Unexpected continuation bytes 60 { '\x80' }, 61 { '\xBF' }, 62 // The following two bytes cannot appear in a correct UTF-8 string 63 { '\xFE' }, 64 { '\xFF' }, 65 { '\xFE', '\xFE', '\xFF', '\xFF' }, 66 67 // ================== 68 // Overlong sequences 69 // Maximum overlong sequences 70 { '\xC1', '\xBF' }, 71 { '\xE0', '\x9F', '\xBF'}, 72 { '\xfc', '\x83', '\xbf', '\xbf', '\xbf', '\xbf' }, 73 74 // Examples of an overlong ASCII character 75 { '\xC0', '\xAF' }, 76 { '\xFC', '\x80', '\x80', '\x80', '\x80', '\xAF' }, 77 78 // Overlong representation of the NUL character 79 { '\xC0', '\x80' }, 80 { '\xE0', '\x80', '\x80' }, 81 { '\xF0', '\x80', '\x80', '\x80' }, 82 { '\xF8', '\x80', '\x80', '\x80', '\x80' }, 83 { '\xFC', '\x80', '\x80', '\x80', '\x80', '\x80' }, 84 85 // ================== 86 // Illegal code positions 87 // Single UTF-16 surrogates 88 { '\xED', '\xA0', '\x80' }, 89 { '\xED', '\xAD', '\xBF' }, 90 { '\xED', '\xAE', '\x80' }, 91 { '\xED', '\xBF', '\xBF' }, 92 // Paired UTF-16 surrogates 93 { '\xED', '\xA0', '\x80', '\xED', '\xB0', '\x80' }, 94 { '\xED', '\xAF', '\xBF', '\xED', '\xBF', '\xBF' }, 95 96 // ================== 97 // Other illegal code positions 98 }; 99 // clang-format on 100 } // namespace 101 102 class Utf8Test : public testing::Test { 103 public: SetUpTestSuite()104 static void SetUpTestSuite() {} TearDownTestSuite()105 static void TearDownTestSuite() {} SetUp()106 void SetUp() override {} TearDown()107 void TearDown() override {} 108 }; 109 110 HWTEST_F(Utf8Test, Count, TestSize.Level1) 111 { 112 EXPECT_EQ(BASE_NS::CountGlyphsUtf8(STR_LAT), 183U); 113 EXPECT_EQ(BASE_NS::CountGlyphsUtf8(STR_CH), 11U); 114 EXPECT_EQ(BASE_NS::CountGlyphsUtf8(STR_RUN), 106U); 115 EXPECT_EQ(BASE_NS::CountGlyphsUtf8(STR_FIN), 21U); 116 EXPECT_EQ(BASE_NS::CountGlyphsUtf8(STR_BOUNDARIES), 8U); 117 118 for (const auto& invalid : STR_INVALID) { 119 EXPECT_EQ(BASE_NS::CountGlyphsUtf8(invalid), 0U); 120 } 121 } 122 123 HWTEST_F(Utf8Test, Decode, TestSize.Level1) 124 { 125 uint32_t codepoint = 0; 126 127 const char* ptr = STR_MATH; 128 for (uint32_t i = 0; (codepoint = BASE_NS::GetCharUtf8(&ptr), codepoint); i++) { 129 EXPECT_EQ(codepoint, STR_MATH_CPS[i]); 130 } 131 ptr = STR_CH; 132 for (uint32_t i = 0; (codepoint = BASE_NS::GetCharUtf8(&ptr), codepoint); i++) { 133 EXPECT_EQ(codepoint, STR_CH_CPS[i]); 134 } 135 ptr = STR_RUN; 136 for (uint32_t i = 0; (codepoint = BASE_NS::GetCharUtf8(&ptr), codepoint); i++) { 137 EXPECT_EQ(codepoint, STR_RUN_CPS[i]); 138 } 139 ptr = STR_FIN; 140 for (uint32_t i = 0; (codepoint = BASE_NS::GetCharUtf8(&ptr), codepoint); i++) { 141 EXPECT_EQ(codepoint, STR_FIN_CPS[i]); 142 } 143 ptr = STR_BOUNDARIES; 144 for (uint32_t i = 0; (codepoint = BASE_NS::GetCharUtf8(&ptr), codepoint); i++) { 145 EXPECT_EQ(codepoint, STR_BOUNDARIES_CPS[i]); 146 } 147 for (size_t i = 0; i < 21; i++) { 148 ptr = STR_INVALID[i]; 149 EXPECT_EQ(BASE_NS::GetCharUtf8(&ptr), 0); 150 } 151 } 152