• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 //
8 // Note: U8_NEXT couldn't be rewritten using the spanification_tool, because it
9 // is a macro.
10 #pragma allow_unsafe_buffers
11 #endif
12 
13 #include "base/i18n/streaming_utf8_validator.h"
14 
15 #include <stddef.h>
16 #include <stdint.h>
17 #include <stdio.h>
18 #include <string.h>
19 
20 #include <string>
21 #include <string_view>
22 
23 #include "base/containers/span.h"
24 #include "base/functional/bind.h"
25 #include "base/location.h"
26 #include "base/logging.h"
27 #include "base/memory/ref_counted.h"
28 #include "base/strings/string_util.h"
29 #include "base/strings/stringprintf.h"
30 #include "base/strings/utf_string_conversion_utils.h"
31 #include "base/synchronization/lock.h"
32 #include "base/task/thread_pool.h"
33 #include "base/test/task_environment.h"
34 #include "testing/gtest/include/gtest/gtest.h"
35 #include "third_party/icu/source/common/unicode/utf8.h"
36 
37 namespace base {
38 namespace {
39 
40 // Avoid having to qualify the enum values in the tests.
41 const StreamingUtf8Validator::State VALID_ENDPOINT =
42     StreamingUtf8Validator::VALID_ENDPOINT;
43 const StreamingUtf8Validator::State VALID_MIDPOINT =
44     StreamingUtf8Validator::VALID_MIDPOINT;
45 const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID;
46 
47 const uint32_t kThoroughTestChunkSize = 1 << 24;
48 
49 class StreamingUtf8ValidatorThoroughTest : public ::testing::Test {
50  protected:
51   StreamingUtf8ValidatorThoroughTest() = default;
52 
53   // This uses the same logic as base::IsStringUTF8 except it considers
54   // non-characters valid (and doesn't require a string as input).
IsStringUtf8(base::span<const uint8_t> src)55   static bool IsStringUtf8(base::span<const uint8_t> src) {
56     size_t char_index = 0;
57     while (char_index < src.size()) {
58       base_icu::UChar32 code_point;
59       U8_NEXT(src, char_index, src.size(), code_point);
60       if (!base::IsValidCodepoint(code_point))
61         return false;
62     }
63     return true;
64   }
65 
66   // Converts the passed-in integer to a 4 byte string and then
67   // verifies that IsStringUtf8 and StreamingUtf8Validator agree on
68   // whether it is valid UTF-8 or not.
TestNumber(uint32_t n) const69   void TestNumber(uint32_t n) const {
70     uint8_t test[sizeof n];
71     memcpy(test, &n, sizeof n);
72     StreamingUtf8Validator validator;
73     EXPECT_EQ(IsStringUtf8(test), validator.AddBytes(test) == VALID_ENDPOINT)
74         << "Difference of opinion for \""
75         << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X", test[0], test[1],
76                               test[2], test[3])
77         << "\"";
78   }
79 
80  public:
81   // Tests the 4-byte sequences corresponding to the |size| integers
82   // starting at |begin|. This is intended to be run from a worker
83   // pool. Signals |all_done_| at the end if it thinks all tasks are
84   // finished.
TestRange(uint32_t begin,uint32_t size)85   void TestRange(uint32_t begin, uint32_t size) {
86     for (uint32_t i = 0; i < size; ++i) {
87       TestNumber(begin + i);
88     }
89     base::AutoLock al(lock_);
90     ++tasks_finished_;
91     LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_
92               << " tasks done\n";
93   }
94 
95  protected:
96   base::Lock lock_;
97   int tasks_dispatched_ = 0;
98   int tasks_finished_ = 0;
99 };
100 
101 // Enable locally to verify that this class accepts exactly the same set of
102 // 4-byte strings as ICU-based validation. This tests every possible 4-byte
103 // string, so it is too slow to run routinely on low-powered machines.
TEST_F(StreamingUtf8ValidatorThoroughTest,DISABLED_TestEverything)104 TEST_F(StreamingUtf8ValidatorThoroughTest, DISABLED_TestEverything) {
105   base::test::TaskEnvironment task_environment;
106   {
107     base::AutoLock al(lock_);
108     uint32_t begin = 0;
109     do {
110       base::ThreadPool::PostTask(
111           FROM_HERE, {base::TaskShutdownBehavior::BLOCK_SHUTDOWN},
112           base::BindOnce(&StreamingUtf8ValidatorThoroughTest::TestRange,
113                          base::Unretained(this), begin,
114                          kThoroughTestChunkSize));
115       ++tasks_dispatched_;
116       begin += kThoroughTestChunkSize;
117     } while (begin != 0);
118   }
119 }
120 
121 // These valid and invalid UTF-8 sequences are based on the tests from
122 // base/strings/string_util_unittest.cc
123 
// Well-formed UTF-8 sequences. Every entry must encode exactly one code point,
// because PartialIterator builds incomplete sequences by taking non-empty
// prefixes of these strings.
const char* const valid[] = {
    "\r",
    "\n",
    "a",
    "\xc2\x81",          // 2-byte sequence
    "\xe1\x80\xbf",      // 3-byte sequence
    "\xf1\x80\xa0\xbf",  // 4-byte sequence
    "\xef\xbb\xbf",      // UTF-8 BOM
};

// One-past-the-end pointer for |valid|.
const char* const* const valid_end = std::end(valid);
133 
// Byte sequences that the validator must reject. The tests below expect
// INVALID for each entry on its own, and also when it follows or precedes any
// valid, partial or invalid sequence.
const char* const invalid[] = {
    // Bytes that are always invalid in UTF-8.
    "\xc0", "\xc1",
    "\xf5", "\xf6", "\xf7",
    "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
    //
    // Encoded surrogate code points. The middle entry used to read
    // "\xed\x0a\x8f", whose second byte is not a continuation byte, so it did
    // not actually exercise the surrogate range.
    "\xed\xa0\x80",  // U+D800
    "\xed\xa0\x8f",  // U+D80F
    "\xed\xbf\xbf",  // U+DFFF
    //
    // Overlong sequences; the comment gives the code point each would decode
    // to if overlong forms were accepted.
    "\xc0\x80",                  // U+0000
    "\xc1\x80",                  // U+0040 "@"
    "\xc1\x81",                  // U+0041 "A"
    "\xe0\x80\x80",              // U+0000
    "\xe0\x82\x80",              // U+0080
    "\xe0\x9f\xbf",              // U+07ff
    "\xf0\x80\x80\x8D",          // U+000D
    "\xf0\x80\x82\x91",          // U+0091
    "\xf0\x80\xa0\x80",          // U+0800
    "\xf0\x8f\xbb\xbf",          // U+FEFF (BOM)
    "\xf8\x80\x80\x80\xbf",      // U+003F
    "\xfc\x80\x80\x80\xa0\xa5",  // U+0825
    //
    // Beyond U+10FFFF
    "\xf4\x90\x80\x80",          // U+110000
    "\xf8\xa0\xbf\x80\xbf",      // 5 bytes
    "\xfc\x9c\xbf\x80\xbf\x80",  // 6 bytes
    //
    // BOMs in UTF-16(BE|LE)
    "\xfe\xff", "\xff\xfe",
};

// One-past-the-end pointer for |invalid|.
const char* const* const invalid_end = invalid + std::size(invalid);
166 
167 // A ForwardIterator which returns all the non-empty prefixes of the elements of
168 // "valid".
169 class PartialIterator {
170  public:
171   // The constructor returns the first iterator, ie. it is equivalent to
172   // begin().
PartialIterator()173   PartialIterator() : index_(0), prefix_length_(0) { Advance(); }
174   // The trivial destructor left intentionally undefined.
175   // This is a value type; the default copy constructor and assignment operator
176   // generated by the compiler are used.
177 
end()178   static PartialIterator end() { return PartialIterator(std::size(valid), 1); }
179 
operator ++()180   PartialIterator& operator++() {
181     Advance();
182     return *this;
183   }
184 
operator *() const185   std::string_view operator*() const {
186     return std::string_view(valid[index_], prefix_length_);
187   }
188 
operator ==(const PartialIterator & rhs) const189   bool operator==(const PartialIterator& rhs) const {
190     return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_;
191   }
192 
operator !=(const PartialIterator & rhs) const193   bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); }
194 
195  private:
196   // This constructor is used by the end() method.
PartialIterator(size_t index,size_t prefix_length)197   PartialIterator(size_t index, size_t prefix_length)
198       : index_(index), prefix_length_(prefix_length) {}
199 
Advance()200   void Advance() {
201     if (index_ < std::size(valid) && prefix_length_ < strlen(valid[index_]))
202       ++prefix_length_;
203     while (index_ < std::size(valid) &&
204            prefix_length_ == strlen(valid[index_])) {
205       ++index_;
206       prefix_length_ = 1;
207     }
208   }
209 
210   // The UTF-8 sequence, as an offset into the |valid| array.
211   size_t index_;
212   size_t prefix_length_;
213 };
214 
215 // A test fixture for tests which test one UTF-8 sequence (or invalid
216 // byte sequence) at a time.
217 class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test {
218  protected:
219   // Iterator must be convertible when de-referenced to std::string_view.
220   template <typename Iterator>
CheckRange(Iterator begin,Iterator end,StreamingUtf8Validator::State expected)221   void CheckRange(Iterator begin,
222                   Iterator end,
223                   StreamingUtf8Validator::State expected) {
224     for (Iterator it = begin; it != end; ++it) {
225       StreamingUtf8Validator validator;
226       std::string_view sequence = *it;
227       EXPECT_EQ(expected, validator.AddBytes(base::as_byte_span(sequence)))
228           << "Failed for \"" << sequence << "\"";
229     }
230   }
231 
232   // Adding input a byte at a time should make absolutely no difference.
233   template <typename Iterator>
CheckRangeByteAtATime(Iterator begin,Iterator end,StreamingUtf8Validator::State expected)234   void CheckRangeByteAtATime(Iterator begin,
235                              Iterator end,
236                              StreamingUtf8Validator::State expected) {
237     for (Iterator it = begin; it != end; ++it) {
238       StreamingUtf8Validator validator;
239       std::string_view sequence = *it;
240       StreamingUtf8Validator::State state = VALID_ENDPOINT;
241       for (const auto& cit : sequence) {
242         state = validator.AddBytes(base::byte_span_from_ref(cit));
243       }
244       EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\"";
245     }
246   }
247 };
248 
249 // A test fixture for tests which test the concatenation of byte sequences.
250 class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test {
251  protected:
252   // Check every possible concatenation of byte sequences from two
253   // ranges, and verify that the combination matches the expected
254   // state.
255   template <typename Iterator1, typename Iterator2>
CheckCombinations(Iterator1 begin1,Iterator1 end1,Iterator2 begin2,Iterator2 end2,StreamingUtf8Validator::State expected)256   void CheckCombinations(Iterator1 begin1,
257                          Iterator1 end1,
258                          Iterator2 begin2,
259                          Iterator2 end2,
260                          StreamingUtf8Validator::State expected) {
261     StreamingUtf8Validator validator;
262     for (Iterator1 it1 = begin1; it1 != end1; ++it1) {
263       std::string_view c1 = *it1;
264       for (Iterator2 it2 = begin2; it2 != end2; ++it2) {
265         std::string_view c2 = *it2;
266         validator.AddBytes(base::as_byte_span(c1));
267         EXPECT_EQ(expected, validator.AddBytes(base::as_byte_span(c2)))
268             << "Failed for \"" << c1 << c2 << "\"";
269         validator.Reset();
270       }
271     }
272   }
273 };
274 
// Adding no bytes at all to a fresh validator leaves it at a valid endpoint.
TEST(StreamingUtf8ValidatorTest, NothingIsValid) {
  StreamingUtf8Validator validator;
  EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes({}));
}
278 
279 // Because the members of the |valid| array need to be non-zero length
280 // sequences and are measured with strlen(), |valid| cannot be used it
281 // to test the NUL character '\0', so the NUL character gets its own
282 // test.
TEST(StreamingUtf8ValidatorTest,NulIsValid)283 TEST(StreamingUtf8ValidatorTest, NulIsValid) {
284   static const char kNul[] = "\x00";
285   EXPECT_EQ(VALID_ENDPOINT,
286             StreamingUtf8Validator().AddBytes(byte_span_from_cstring(kNul)));
287 }
288 
289 // Just a basic sanity test before we start getting fancy.
TEST(StreamingUtf8ValidatorTest,HelloWorld)290 TEST(StreamingUtf8ValidatorTest, HelloWorld) {
291   static const char kHelloWorld[] = "Hello, World!";
292   EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(
293                                 byte_span_from_cstring(kHelloWorld)));
294 }
295 
296 // Check that the Reset() method works.
TEST(StreamingUtf8ValidatorTest,ResetWorks)297 TEST(StreamingUtf8ValidatorTest, ResetWorks) {
298   StreamingUtf8Validator validator;
299   EXPECT_EQ(INVALID, validator.AddBytes(byte_span_from_cstring("\xC0")));
300   EXPECT_EQ(INVALID, validator.AddBytes(byte_span_from_cstring("a")));
301   validator.Reset();
302   EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes(byte_span_from_cstring("a")));
303 }
304 
// Each complete sequence in |valid|, added in one call, ends on a character
// boundary.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) {
  const StreamingUtf8Validator::State want = VALID_ENDPOINT;
  CheckRange(valid, valid_end, want);
}
308 
// Each truncated prefix of a multi-byte sequence, added in one call, leaves
// the validator in the middle of a character.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) {
  const PartialIterator first;
  const PartialIterator last = PartialIterator::end();
  CheckRange(first, last, VALID_MIDPOINT);
}
312 
// Each sequence in |invalid|, added in one call, is rejected.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) {
  const StreamingUtf8Validator::State want = INVALID;
  CheckRange(invalid, invalid_end, want);
}
316 
// Each complete sequence in |valid|, added one byte at a time, still ends on
// a character boundary.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) {
  const StreamingUtf8Validator::State want = VALID_ENDPOINT;
  CheckRangeByteAtATime(valid, valid_end, want);
}
320 
// Each truncated prefix, added one byte at a time, still leaves the validator
// in the middle of a character.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) {
  const PartialIterator first;
  const PartialIterator last = PartialIterator::end();
  CheckRangeByteAtATime(first, last, VALID_MIDPOINT);
}
325 
// Each sequence in |invalid|, added one byte at a time, is still rejected by
// the time its last byte has been added.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) {
  const StreamingUtf8Validator::State want = INVALID;
  CheckRangeByteAtATime(invalid, invalid_end, want);
}
329 
// Two complete sequences in a row end on a character boundary.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) {
  const StreamingUtf8Validator::State want = VALID_ENDPOINT;
  CheckCombinations(valid, valid_end, valid, valid_end, want);
}
333 
// A complete sequence followed by a truncated one leaves the validator in the
// middle of a character.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) {
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(valid, valid_end, partial_begin, partial_end,
                    VALID_MIDPOINT);
}
341 
// A truncated sequence followed by a complete one is rejected: the complete
// sequence starts where the truncated one still needed more bytes.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) {
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(partial_begin, partial_end, valid, valid_end, INVALID);
}
346 
// Two truncated sequences in a row are rejected: the second starts a new
// character before the first has been completed.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) {
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(partial_begin, partial_end, partial_begin, partial_end,
                    INVALID);
}
354 
// A complete sequence followed by an invalid one is rejected.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) {
  const StreamingUtf8Validator::State want = INVALID;
  CheckCombinations(valid, valid_end, invalid, invalid_end, want);
}
358 
// Valid input after an invalid sequence does not make the stream valid again.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) {
  const StreamingUtf8Validator::State want = INVALID;
  CheckCombinations(invalid, invalid_end, valid, valid_end, want);
}
362 
// Two invalid sequences in a row are rejected.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) {
  const StreamingUtf8Validator::State want = INVALID;
  CheckCombinations(invalid, invalid_end, invalid, invalid_end, want);
}
366 
// A truncated sequence after an invalid one is reported as INVALID, not as a
// valid midpoint.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) {
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(invalid, invalid_end, partial_begin, partial_end, INVALID);
}
371 
// A truncated sequence followed by an invalid one is rejected.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) {
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(partial_begin, partial_end, invalid, invalid_end, INVALID);
}
376 
// Validate() accepts the empty string.
TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) {
  const std::string empty;
  EXPECT_TRUE(StreamingUtf8Validator::Validate(empty));
}
380 
// Validate() accepts a complete two-byte sequence.
TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) {
  const bool accepted = StreamingUtf8Validator::Validate("\xc2\x81");
  EXPECT_TRUE(accepted);
}
384 
// Validate() rejects the overlong two-byte encoding of U+0000.
TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) {
  const bool accepted = StreamingUtf8Validator::Validate("\xc0\x80");
  EXPECT_FALSE(accepted);
}
388 
// Validate() rejects input that stops after the lead byte of a two-byte
// sequence.
TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) {
  const bool accepted = StreamingUtf8Validator::Validate("\xc2");
  EXPECT_FALSE(accepted);
}
392 
393 }  // namespace
394 }  // namespace base
395