• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2025 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include <gtest/gtest.h>
17 
18 #include <base/containers/string.h>
19 #include <base/containers/string_view.h>
20 #include <base/containers/vector.h>
21 #include <base/util/utf8_decode.h>
22 
23 using namespace testing::ext;
24 
namespace {
// clang-format off

// Well-formed samples. Each STR_* literal is UTF-8 text; where a matching *_CPS table exists it lists the
// code points of that text in order.

// ASCII only: every character is encoded as a single byte.
constexpr char STR_LAT[] =
    u8"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut "
    u8"labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco ";

// Two symbols separated by an ASCII space.
constexpr char STR_MATH[] = u8"\u2B30 \u2B48";
constexpr uint32_t STR_MATH_CPS[] = { 0x2B30, 0x0020, 0x2B48 };

// Chinese text.
constexpr char STR_CH[] = u8"快速的棕色狐狸跳过懒狗";
constexpr uint32_t STR_CH_CPS[] = {
    0x5FEB, 0x901F, 0x7684, 0x68D5, 0x8272, 0x72D0, 0x72F8, 0x8DF3, 0x8FC7, 0x61D2, 0x72D7,
};

// Runic text, written as three adjacent literals that the compiler concatenates.
constexpr char STR_RUN[] =
    u8"ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
    u8"ᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ"
    u8"ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ";
constexpr uint32_t STR_RUN_CPS[] = {
    // Code points of the first literal above.
    0x16A0, 0x16C7, 0x16BB, 0x16EB, 0x16D2, 0x16E6, 0x16A6, 0x16EB, 0x16A0, 0x16B1,
    0x16A9, 0x16A0, 0x16A2, 0x16B1, 0x16EB, 0x16A0, 0x16C1, 0x16B1, 0x16AA, 0x16EB,
    0x16B7, 0x16D6, 0x16BB, 0x16B9, 0x16E6, 0x16DA, 0x16B3, 0x16A2, 0x16D7,
    // Code points of the second literal.
    0x16CB, 0x16B3, 0x16D6, 0x16AA, 0x16DA, 0x16EB, 0x16A6, 0x16D6, 0x16AA, 0x16BB,
    0x16EB, 0x16D7, 0x16AA, 0x16BE, 0x16BE, 0x16AA, 0x16EB, 0x16B7, 0x16D6, 0x16BB,
    0x16B9, 0x16E6, 0x16DA, 0x16B3, 0x16EB, 0x16D7, 0x16C1, 0x16B3, 0x16DA, 0x16A2,
    0x16BE, 0x16EB, 0x16BB, 0x16E6, 0x16CF, 0x16EB, 0x16DE, 0x16AB, 0x16DA, 0x16AA, 0x16BE,
    // Code points of the third literal.
    0x16B7, 0x16C1, 0x16A0, 0x16EB, 0x16BB, 0x16D6, 0x16EB, 0x16B9, 0x16C1, 0x16DA, 0x16D6, 0x16EB,
    0x16A0, 0x16A9, 0x16B1, 0x16EB, 0x16DE, 0x16B1, 0x16C1, 0x16BB, 0x16CF, 0x16BE, 0x16D6, 0x16EB,
    0x16DE, 0x16A9, 0x16D7, 0x16D6, 0x16CB, 0x16EB, 0x16BB, 0x16DA, 0x16C7, 0x16CF, 0x16AA, 0x16BE,
};

// Mostly ASCII word containing two non-ASCII letters (U+00F6 and U+00E4).
constexpr char STR_FIN[] = u8"Törkylempijävongahdus";
constexpr uint32_t STR_FIN_CPS[] = {
    0x0054, 0x00F6, 0x0072, 0x006B, 0x0079, 0x006C, 0x0065, 0x006D, 0x0070, 0x0069, 0x006A,
    0x00E4, 0x0076, 0x006F, 0x006E, 0x0067, 0x0061, 0x0068, 0x0064, 0x0075, 0x0073,
};

// Code points at the edges of the 1-, 2- and 3-byte encodings and on either side of the surrogate range.
constexpr char STR_BOUNDARIES[] = u8"\u0080\u0800\u007F\u07FF\uFFFF\uD7FF\uE000\uFFFD";
constexpr uint32_t STR_BOUNDARIES_CPS[] = {
    0x0080, 0x0800, 0x007F, 0x07FF, 0xFFFF, 0xD7FF, 0xE000, 0xFFFD,
};

// Malformed byte sequences, see https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
// Every row holds 7 chars: the longest sequence below is 6 bytes and the bytes left out of an
// initializer are zero, so each row is always NUL-terminated.
constexpr char STR_INVALID[][7] = {
    // Unexpected continuation bytes
    { '\x80' },
    { '\xBF' },
    // The following two bytes cannot appear in a correct UTF-8 string
    { '\xFE' },
    { '\xFF' },
    { '\xFE', '\xFE', '\xFF', '\xFF' },

    // ==================
    // Overlong sequences
    // Maximum overlong sequences
    { '\xC1', '\xBF' },
    { '\xE0', '\x9F', '\xBF' },
    { '\xFC', '\x83', '\xBF', '\xBF', '\xBF', '\xBF' },

    // Examples of an overlong ASCII character
    { '\xC0', '\xAF' },
    { '\xFC', '\x80', '\x80', '\x80', '\x80', '\xAF' },

    // Overlong representation of the NUL character
    { '\xC0', '\x80' },
    { '\xE0', '\x80', '\x80' },
    { '\xF0', '\x80', '\x80', '\x80' },
    { '\xF8', '\x80', '\x80', '\x80', '\x80' },
    { '\xFC', '\x80', '\x80', '\x80', '\x80', '\x80' },

    // ==================
    // Illegal code positions
    // Single UTF-16 surrogates
    { '\xED', '\xA0', '\x80' },
    { '\xED', '\xAD', '\xBF' },
    { '\xED', '\xAE', '\x80' },
    { '\xED', '\xBF', '\xBF' },
    // Paired UTF-16 surrogates
    { '\xED', '\xA0', '\x80', '\xED', '\xB0', '\x80' },
    { '\xED', '\xAF', '\xBF', '\xED', '\xBF', '\xBF' },

    // ==================
    // Other illegal code positions (no entries listed)
};
// clang-format on
} // namespace
101 
102 class Utf8Test : public testing::Test {
103 public:
SetUpTestSuite()104     static void SetUpTestSuite() {}
TearDownTestSuite()105     static void TearDownTestSuite() {}
SetUp()106     void SetUp() override {}
TearDown()107     void TearDown() override {}
108 };
109 
110 HWTEST_F(Utf8Test, Count, TestSize.Level1)
111 {
112     EXPECT_EQ(BASE_NS::CountGlyphsUtf8(STR_LAT), 183U);
113     EXPECT_EQ(BASE_NS::CountGlyphsUtf8(STR_CH), 11U);
114     EXPECT_EQ(BASE_NS::CountGlyphsUtf8(STR_RUN), 106U);
115     EXPECT_EQ(BASE_NS::CountGlyphsUtf8(STR_FIN), 21U);
116     EXPECT_EQ(BASE_NS::CountGlyphsUtf8(STR_BOUNDARIES), 8U);
117 
118     for (const auto& invalid : STR_INVALID) {
119         EXPECT_EQ(BASE_NS::CountGlyphsUtf8(invalid), 0U);
120     }
121 }
122 
123 HWTEST_F(Utf8Test, Decode, TestSize.Level1)
124 {
125     uint32_t codepoint = 0;
126 
127     const char* ptr = STR_MATH;
128     for (uint32_t i = 0; (codepoint = BASE_NS::GetCharUtf8(&ptr), codepoint); i++) {
129         EXPECT_EQ(codepoint, STR_MATH_CPS[i]);
130     }
131     ptr = STR_CH;
132     for (uint32_t i = 0; (codepoint = BASE_NS::GetCharUtf8(&ptr), codepoint); i++) {
133         EXPECT_EQ(codepoint, STR_CH_CPS[i]);
134     }
135     ptr = STR_RUN;
136     for (uint32_t i = 0; (codepoint = BASE_NS::GetCharUtf8(&ptr), codepoint); i++) {
137         EXPECT_EQ(codepoint, STR_RUN_CPS[i]);
138     }
139     ptr = STR_FIN;
140     for (uint32_t i = 0; (codepoint = BASE_NS::GetCharUtf8(&ptr), codepoint); i++) {
141         EXPECT_EQ(codepoint, STR_FIN_CPS[i]);
142     }
143     ptr = STR_BOUNDARIES;
144     for (uint32_t i = 0; (codepoint = BASE_NS::GetCharUtf8(&ptr), codepoint); i++) {
145         EXPECT_EQ(codepoint, STR_BOUNDARIES_CPS[i]);
146     }
147     for (size_t i = 0; i < 21; i++) {
148         ptr = STR_INVALID[i];
149         EXPECT_EQ(BASE_NS::GetCharUtf8(&ptr), 0);
150     }
151 }
152