• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 // All data that is passed through a WebSocket with type "Text" needs to be
11 // validated as UTF8. Since this is done on the IO thread, it needs to be
12 // reasonably fast.
13 
14 // We are only interested in the performance on valid UTF8. Invalid UTF8 will
15 // result in a connection failure, so is unlikely to become a source of
16 // performance issues.
17 
18 #include "base/i18n/streaming_utf8_validator.h"
19 
20 #include <stddef.h>
21 
22 #include <string>
23 #include <string_view>
24 
25 #include "base/functional/bind.h"
26 #include "base/functional/callback.h"
27 #include "base/strings/string_util.h"
28 #include "base/strings/stringprintf.h"
29 #include "base/test/perf_time_logger.h"
30 #include "testing/gtest/include/gtest/gtest.h"
31 
32 namespace base {
33 namespace {
34 
// Inclusive ranges of valid UTF-8 sequences used to build the test strings,
// one range per encoded length. Each range is meant to be big enough to make
// the validator do meaningful work while still being "realistic" in some
// sense (eg. control characters are left out).

// 1-byte sequences: U+0020 (space) through U+007E (tilde).
constexpr char kOneByteSeqRangeStart[] = " ";
constexpr char kOneByteSeqRangeEnd[] = "~";

// 2-byte sequences: U+00A0 (non-breaking space) through U+024F (small y with
// stroke).
constexpr char kTwoByteSeqRangeStart[] = "\xc2\xa0";
constexpr char kTwoByteSeqRangeEnd[] = "\xc9\x8f";

// 3-byte sequences: U+3042 (Hiragana "a") through U+9FC3 ("to blink").
constexpr char kThreeByteSeqRangeStart[] = "\xe3\x81\x82";
constexpr char kThreeByteSeqRangeEnd[] = "\xe9\xbf\x83";

// 4-byte sequences: U+2000B through U+2A6B2.
constexpr char kFourByteSeqRangeStart[] = "\xf0\xa0\x80\x8b";
constexpr char kFourByteSeqRangeEnd[] = "\xf0\xaa\x9a\xb2";
50 
// Requested sizes, in bytes, of the strings each test is run against. The
// largest entry is 1 MiB.
constexpr size_t kTestLengths[] = {
    1,
    32,
    256,
    32768,
    1 << 20,
};
53 
// Baseline for comparison: the most naive byte-at-a-time check there is.
// Returns true iff no byte of |s| has its top bit set. It is only run against
// 1-byte UTF-8 sequences, since its result is not meaningful for input that
// contains top-bit-set bytes.
bool IsString7Bit(const std::string& s) {
  for (const unsigned char byte : s) {
    if (byte >= 0x80) {
      return false;
    }
  }
  return true;
}
65 
66 // Assumes that |previous| is a valid UTF-8 sequence, and attempts to return
67 // the next one. Is just barely smart enough to iterate through the ranges
68 // defined about.
NextUtf8Sequence(const std::string & previous)69 std::string NextUtf8Sequence(const std::string& previous) {
70   DCHECK(StreamingUtf8Validator::Validate(previous));
71   std::string next = previous;
72   for (int i = static_cast<int>(previous.length() - 1); i >= 0; --i) {
73     // All bytes in a UTF-8 sequence except the first one are
74     // constrained to the range 0x80 to 0xbf, inclusive. When we
75     // increment past 0xbf, we carry into the previous byte.
76     if (i > 0 && next[i] == '\xbf') {
77       next[i] = '\x80';
78       continue;  // carry
79     }
80     ++next[i];
81     break;  // no carry
82   }
83   DCHECK(StreamingUtf8Validator::Validate(next))
84       << "Result \"" << next << "\" failed validation";
85   return next;
86 }
87 
// Signature shared by every validation function that can be timed.
using TestTargetType = bool (*)(const std::string&);
89 
90 // Run fuction |target| over |test_string| |times| times, and report the results
91 // using |description|.
RunTest(const std::string & description,TestTargetType target,const std::string & test_string,int times)92 bool RunTest(const std::string& description,
93              TestTargetType target,
94              const std::string& test_string,
95              int times) {
96   base::PerfTimeLogger timer(description.c_str());
97   bool result = true;
98   for (int i = 0; i < times; ++i) {
99     result = target(test_string) && result;
100   }
101   timer.Done();
102   return result;
103 }
104 
// Constructs a string made of whole copies of |input|, using the smallest
// number of copies whose combined size equals or exceeds |length|. At least
// one copy is always produced, so a |length| of zero still yields |input|.
//
// An empty |input| can never reach a non-zero |length|; in that case an empty
// string is returned instead of looping forever.
std::string ConstructRepeatedTestString(const std::string& input,
                                        size_t length) {
  if (input.empty()) {
    return std::string();
  }

  // Round |length| up to a whole number of copies, with a minimum of one.
  const size_t unit = input.length();
  const size_t copies = length > unit ? (length - 1) / unit + 1 : 1;
  const size_t target_length = copies * unit;

  std::string output = input;
  output.reserve(target_length);
  // Grow by doubling so that long strings need only a handful of appends.
  // |output| always holds a whole number of copies of |input|, and so does
  // every appended prefix, because both sizes involved are multiples of
  // |unit|.
  while (output.length() < target_length) {
    const size_t missing = target_length - output.length();
    const size_t chunk = missing < output.length() ? missing : output.length();
    output.append(output, 0, chunk);
  }
  return output;
}
118 
119 // Construct a string by expanding the range of UTF-8 sequences
120 // between |input_start| and |input_end|, inclusive, and then
121 // repeating the resulting string until it equals or exceeds |length|
122 // bytes. |input_start| and |input_end| must be valid UTF-8
123 // sequences.
ConstructRangedTestString(const std::string & input_start,const std::string & input_end,size_t length)124 std::string ConstructRangedTestString(const std::string& input_start,
125                                       const std::string& input_end,
126                                       size_t length) {
127   std::string output = input_start;
128   std::string input = input_start;
129   while (output.length() < length && input != input_end) {
130     input = NextUtf8Sequence(input);
131     output += input;
132   }
133   if (output.length() < length) {
134     output = ConstructRepeatedTestString(output, length);
135   }
136   return output;
137 }
138 
139 struct TestFunctionDescription {
140   TestTargetType function;
141   const char* function_name;
142 };
143 
IsStringUTF8(const std::string & str)144 bool IsStringUTF8(const std::string& str) {
145   return base::IsStringUTF8(std::string_view(str));
146 }
147 
148 // IsString7Bit is intentionally placed last so it can be excluded easily.
149 const TestFunctionDescription kTestFunctions[] = {
150     {&StreamingUtf8Validator::Validate, "StreamingUtf8Validator"},
151     {&IsStringUTF8, "IsStringUTF8"}, {&IsString7Bit, "IsString7Bit"}};
152 
153 // Construct a test string from |construct_test_string| for each of the lengths
154 // in |kTestLengths| in turn. For each string, run each test in |test_functions|
155 // for a number of iterations such that the total number of bytes validated
156 // is around 16MB.
RunSomeTests(const char format[],base::RepeatingCallback<std::string (size_t length)> construct_test_string,const TestFunctionDescription * test_functions,size_t test_count)157 void RunSomeTests(
158     const char format[],
159     base::RepeatingCallback<std::string(size_t length)> construct_test_string,
160     const TestFunctionDescription* test_functions,
161     size_t test_count) {
162   for (auto length : kTestLengths) {
163     const std::string test_string = construct_test_string.Run(length);
164     const int real_length = static_cast<int>(test_string.length());
165     const int times = (1 << 24) / real_length;
166     for (size_t test_index = 0; test_index < test_count; ++test_index) {
167       EXPECT_TRUE(RunTest(StringPrintfNonConstexpr(
168                               format, test_functions[test_index].function_name,
169                               real_length, times),
170                           test_functions[test_index].function, test_string,
171                           times));
172     }
173   }
174 }
175 
// A single 1-byte sequence repeated. The input is pure 7-bit, so all three
// entries of kTestFunctions are run, including the IsString7Bit baseline.
TEST(StreamingUtf8ValidatorPerfTest, OneByteRepeated) {
  const auto make_test_string =
      base::BindRepeating(ConstructRepeatedTestString, kOneByteSeqRangeStart);
  RunSomeTests("%s: bytes=1 repeated length=%d repeat=%d", make_test_string,
               kTestFunctions, 3);
}
182 
// The full range of 1-byte sequences. The input is pure 7-bit, so all three
// entries of kTestFunctions are run, including the IsString7Bit baseline.
TEST(StreamingUtf8ValidatorPerfTest, OneByteRange) {
  const auto make_test_string = base::BindRepeating(
      ConstructRangedTestString, kOneByteSeqRangeStart, kOneByteSeqRangeEnd);
  RunSomeTests("%s: bytes=1 ranged length=%d repeat=%d", make_test_string,
               kTestFunctions, 3);
}
189 
// A single 2-byte sequence repeated. Only the first two entries of
// kTestFunctions are run; the trailing IsString7Bit entry is left out because
// the input contains top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, TwoByteRepeated) {
  const auto make_test_string =
      base::BindRepeating(ConstructRepeatedTestString, kTwoByteSeqRangeStart);
  RunSomeTests("%s: bytes=2 repeated length=%d repeat=%d", make_test_string,
               kTestFunctions, 2);
}
196 
// The full range of 2-byte sequences. Only the first two entries of
// kTestFunctions are run; the trailing IsString7Bit entry is left out because
// the input contains top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, TwoByteRange) {
  const auto make_test_string = base::BindRepeating(
      ConstructRangedTestString, kTwoByteSeqRangeStart, kTwoByteSeqRangeEnd);
  RunSomeTests("%s: bytes=2 ranged length=%d repeat=%d", make_test_string,
               kTestFunctions, 2);
}
203 
// A single 3-byte sequence repeated. Only the first two entries of
// kTestFunctions are run; the trailing IsString7Bit entry is left out because
// the input contains top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRepeated) {
  const auto make_test_string =
      base::BindRepeating(ConstructRepeatedTestString, kThreeByteSeqRangeStart);
  RunSomeTests("%s: bytes=3 repeated length=%d repeat=%d", make_test_string,
               kTestFunctions, 2);
}
210 
// The full range of 3-byte sequences. Only the first two entries of
// kTestFunctions are run; the trailing IsString7Bit entry is left out because
// the input contains top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRange) {
  const auto make_test_string =
      base::BindRepeating(ConstructRangedTestString, kThreeByteSeqRangeStart,
                          kThreeByteSeqRangeEnd);
  RunSomeTests("%s: bytes=3 ranged length=%d repeat=%d", make_test_string,
               kTestFunctions, 2);
}
218 
// A single 4-byte sequence repeated. Only the first two entries of
// kTestFunctions are run; the trailing IsString7Bit entry is left out because
// the input contains top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, FourByteRepeated) {
  const auto make_test_string =
      base::BindRepeating(ConstructRepeatedTestString, kFourByteSeqRangeStart);
  RunSomeTests("%s: bytes=4 repeated length=%d repeat=%d", make_test_string,
               kTestFunctions, 2);
}
225 
// The full range of 4-byte sequences. Only the first two entries of
// kTestFunctions are run; the trailing IsString7Bit entry is left out because
// the input contains top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, FourByteRange) {
  const auto make_test_string =
      base::BindRepeating(ConstructRangedTestString, kFourByteSeqRangeStart,
                          kFourByteSeqRangeEnd);
  RunSomeTests("%s: bytes=4 ranged length=%d repeat=%d", make_test_string,
               kTestFunctions, 2);
}
233 
234 }  // namespace
235 }  // namespace base
236