• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/i18n/streaming_utf8_validator.h"
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <stdio.h>
10 #include <string.h>
11 
12 #include <string>
13 
14 #include "base/functional/bind.h"
15 #include "base/location.h"
16 #include "base/logging.h"
17 #include "base/memory/ref_counted.h"
18 #include "base/strings/string_piece.h"
19 #include "base/strings/string_util.h"
20 #include "base/strings/stringprintf.h"
21 #include "base/strings/utf_string_conversion_utils.h"
22 #include "base/synchronization/lock.h"
23 #include "base/task/thread_pool.h"
24 #include "base/test/task_environment.h"
25 #include "testing/gtest/include/gtest/gtest.h"
26 #include "third_party/icu/source/common/unicode/utf8.h"
27 
28 namespace base {
29 namespace {
30 
31 // Avoid having to qualify the enum values in the tests.
32 const StreamingUtf8Validator::State VALID_ENDPOINT =
33     StreamingUtf8Validator::VALID_ENDPOINT;
34 const StreamingUtf8Validator::State VALID_MIDPOINT =
35     StreamingUtf8Validator::VALID_MIDPOINT;
36 const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID;
37 
38 const uint32_t kThoroughTestChunkSize = 1 << 24;
39 
40 class StreamingUtf8ValidatorThoroughTest : public ::testing::Test {
41  protected:
StreamingUtf8ValidatorThoroughTest()42   StreamingUtf8ValidatorThoroughTest()
43       : tasks_dispatched_(0), tasks_finished_(0) {}
44 
45   // This uses the same logic as base::IsStringUTF8 except it considers
46   // non-characters valid (and doesn't require a string as input).
IsStringUtf8(const char * src,int32_t src_len)47   static bool IsStringUtf8(const char* src, int32_t src_len) {
48     int32_t char_index = 0;
49 
50     while (char_index < src_len) {
51       base_icu::UChar32 code_point;
52       U8_NEXT(src, char_index, src_len, code_point);
53       if (!base::IsValidCodepoint(code_point))
54         return false;
55     }
56     return true;
57   }
58 
59   // Converts the passed-in integer to a 4 byte string and then
60   // verifies that IsStringUtf8 and StreamingUtf8Validator agree on
61   // whether it is valid UTF-8 or not.
TestNumber(uint32_t n) const62   void TestNumber(uint32_t n) const {
63     char test[sizeof n];
64     memcpy(test, &n, sizeof n);
65     StreamingUtf8Validator validator;
66     EXPECT_EQ(IsStringUtf8(test, sizeof n),
67               validator.AddBytes(test, sizeof n) == VALID_ENDPOINT)
68         << "Difference of opinion for \""
69         << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X",
70                               test[0] & 0xFF,
71                               test[1] & 0xFF,
72                               test[2] & 0xFF,
73                               test[3] & 0xFF) << "\"";
74   }
75 
76  public:
77   // Tests the 4-byte sequences corresponding to the |size| integers
78   // starting at |begin|. This is intended to be run from a worker
79   // pool. Signals |all_done_| at the end if it thinks all tasks are
80   // finished.
TestRange(uint32_t begin,uint32_t size)81   void TestRange(uint32_t begin, uint32_t size) {
82     for (uint32_t i = 0; i < size; ++i) {
83       TestNumber(begin + i);
84     }
85     base::AutoLock al(lock_);
86     ++tasks_finished_;
87     LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_
88               << " tasks done\n";
89   }
90 
91  protected:
92   base::Lock lock_;
93   int tasks_dispatched_;
94   int tasks_finished_;
95 };
96 
97 // Enable locally to verify that this class accepts exactly the same set of
98 // 4-byte strings as ICU-based validation. This tests every possible 4-byte
99 // string, so it is too slow to run routinely on low-powered machines.
TEST_F(StreamingUtf8ValidatorThoroughTest,DISABLED_TestEverything)100 TEST_F(StreamingUtf8ValidatorThoroughTest, DISABLED_TestEverything) {
101   base::test::TaskEnvironment task_environment;
102   {
103     base::AutoLock al(lock_);
104     uint32_t begin = 0;
105     do {
106       base::ThreadPool::PostTask(
107           FROM_HERE, {base::TaskShutdownBehavior::BLOCK_SHUTDOWN},
108           base::BindOnce(&StreamingUtf8ValidatorThoroughTest::TestRange,
109                          base::Unretained(this), begin,
110                          kThoroughTestChunkSize));
111       ++tasks_dispatched_;
112       begin += kThoroughTestChunkSize;
113     } while (begin != 0);
114   }
115 }
116 
117 // These valid and invalid UTF-8 sequences are based on the tests from
118 // base/strings/string_util_unittest.cc
119 
// Each entry of |valid| must encode exactly one code point: the partial
// sequences used by the tests are built by taking non-empty prefixes of these
// strings.
const char* const valid[] = {
    "\r",
    "\n",
    "a",
    "\xc2\x81",
    "\xe1\x80\xbf",
    "\xf1\x80\xa0\xbf",
    "\xef\xbb\xbf",  // UTF-8 BOM
};

// One-past-the-end pointer for |valid|.
const char* const* const valid_end = std::end(valid);
129 
// Byte sequences that must be rejected as soon as they have been fed to a
// fresh validator in full. Every entry is measured with strlen(), so none may
// contain an embedded NUL.
const char* const invalid[] = {
    // always invalid bytes
    "\xc0", "\xc1",
    "\xf5", "\xf6", "\xf7",
    "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
    // surrogate code points
    "\xed\xa0\x80",  // U+D800
    "\xed\xa0\x8f",  // U+D80F
    "\xed\xbf\xbf",  // U+DFFF
    //
    // lead byte followed by a byte that is not a continuation byte (this was
    // previously listed among the surrogates, although 0x0a cannot be part of
    // a surrogate encoding)
    "\xed\x0a\x8f",
    //
    // overlong sequences
    "\xc0\x80",              // U+0000
    "\xc1\x80",              // "A"
    "\xc1\x81",              // "B"
    "\xe0\x80\x80",          // U+0000
    "\xe0\x82\x80",          // U+0080
    "\xe0\x9f\xbf",          // U+07ff
    "\xf0\x80\x80\x8D",      // U+000D
    "\xf0\x80\x82\x91",      // U+0091
    "\xf0\x80\xa0\x80",      // U+0800
    "\xf0\x8f\xbb\xbf",      // U+FEFF (BOM)
    "\xf8\x80\x80\x80\xbf",  // U+003F
    "\xfc\x80\x80\x80\xa0\xa5",
    //
    // Beyond U+10FFFF
    "\xf4\x90\x80\x80",          // U+110000
    "\xf8\xa0\xbf\x80\xbf",      // 5 bytes
    "\xfc\x9c\xbf\x80\xbf\x80",  // 6 bytes
    //
    // BOMs in UTF-16(BE|LE)
    "\xfe\xff", "\xff\xfe",
};

// One-past-the-end pointer for |invalid|.
const char* const* const invalid_end = invalid + std::size(invalid);
162 
163 // A ForwardIterator which returns all the non-empty prefixes of the elements of
164 // "valid".
165 class PartialIterator {
166  public:
167   // The constructor returns the first iterator, ie. it is equivalent to
168   // begin().
PartialIterator()169   PartialIterator() : index_(0), prefix_length_(0) { Advance(); }
170   // The trivial destructor left intentionally undefined.
171   // This is a value type; the default copy constructor and assignment operator
172   // generated by the compiler are used.
173 
end()174   static PartialIterator end() { return PartialIterator(std::size(valid), 1); }
175 
operator ++()176   PartialIterator& operator++() {
177     Advance();
178     return *this;
179   }
180 
operator *() const181   base::StringPiece operator*() const {
182     return base::StringPiece(valid[index_], prefix_length_);
183   }
184 
operator ==(const PartialIterator & rhs) const185   bool operator==(const PartialIterator& rhs) const {
186     return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_;
187   }
188 
operator !=(const PartialIterator & rhs) const189   bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); }
190 
191  private:
192   // This constructor is used by the end() method.
PartialIterator(size_t index,size_t prefix_length)193   PartialIterator(size_t index, size_t prefix_length)
194       : index_(index), prefix_length_(prefix_length) {}
195 
Advance()196   void Advance() {
197     if (index_ < std::size(valid) && prefix_length_ < strlen(valid[index_]))
198       ++prefix_length_;
199     while (index_ < std::size(valid) &&
200            prefix_length_ == strlen(valid[index_])) {
201       ++index_;
202       prefix_length_ = 1;
203     }
204   }
205 
206   // The UTF-8 sequence, as an offset into the |valid| array.
207   size_t index_;
208   size_t prefix_length_;
209 };
210 
211 // A test fixture for tests which test one UTF-8 sequence (or invalid
212 // byte sequence) at a time.
213 class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test {
214  protected:
215   // Iterator must be convertible when de-referenced to StringPiece.
216   template <typename Iterator>
CheckRange(Iterator begin,Iterator end,StreamingUtf8Validator::State expected)217   void CheckRange(Iterator begin,
218                   Iterator end,
219                   StreamingUtf8Validator::State expected) {
220     for (Iterator it = begin; it != end; ++it) {
221       StreamingUtf8Validator validator;
222       base::StringPiece sequence = *it;
223       EXPECT_EQ(expected,
224                 validator.AddBytes(sequence.data(), sequence.size()))
225           << "Failed for \"" << sequence << "\"";
226     }
227   }
228 
229   // Adding input a byte at a time should make absolutely no difference.
230   template <typename Iterator>
CheckRangeByteAtATime(Iterator begin,Iterator end,StreamingUtf8Validator::State expected)231   void CheckRangeByteAtATime(Iterator begin,
232                              Iterator end,
233                              StreamingUtf8Validator::State expected) {
234     for (Iterator it = begin; it != end; ++it) {
235       StreamingUtf8Validator validator;
236       base::StringPiece sequence = *it;
237       StreamingUtf8Validator::State state = VALID_ENDPOINT;
238       for (const auto& cit : sequence) {
239         state = validator.AddBytes(&cit, 1);
240       }
241       EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\"";
242     }
243   }
244 };
245 
246 // A test fixture for tests which test the concatenation of byte sequences.
247 class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test {
248  protected:
249   // Check every possible concatenation of byte sequences from two
250   // ranges, and verify that the combination matches the expected
251   // state.
252   template <typename Iterator1, typename Iterator2>
CheckCombinations(Iterator1 begin1,Iterator1 end1,Iterator2 begin2,Iterator2 end2,StreamingUtf8Validator::State expected)253   void CheckCombinations(Iterator1 begin1,
254                          Iterator1 end1,
255                          Iterator2 begin2,
256                          Iterator2 end2,
257                          StreamingUtf8Validator::State expected) {
258     StreamingUtf8Validator validator;
259     for (Iterator1 it1 = begin1; it1 != end1; ++it1) {
260       base::StringPiece c1 = *it1;
261       for (Iterator2 it2 = begin2; it2 != end2; ++it2) {
262         base::StringPiece c2 = *it2;
263         validator.AddBytes(c1.data(), c1.size());
264         EXPECT_EQ(expected, validator.AddBytes(c2.data(), c2.size()))
265             << "Failed for \"" << c1 << c2 << "\"";
266         validator.Reset();
267       }
268     }
269   }
270 };
271 
// Feeding zero bytes to a fresh validator leaves it at a valid endpoint.
TEST(StreamingUtf8ValidatorTest, NothingIsValid) {
  static const char kNothing[] = "";
  EXPECT_EQ(VALID_ENDPOINT,
            StreamingUtf8Validator().AddBytes(kNothing, 0));
}
276 
// Because the members of the |valid| array need to be non-zero length
// sequences and are measured with strlen(), |valid| cannot be used
// to test the NUL character '\0', so the NUL character gets its own
// test.
TEST(StreamingUtf8ValidatorTest, NulIsValid) {
  static const char kNul[] = "\x00";
  // The length is passed explicitly: exactly one byte, the NUL itself.
  EXPECT_EQ(VALID_ENDPOINT,
            StreamingUtf8Validator().AddBytes(kNul, 1));
}
285 
286 // Just a basic sanity test before we start getting fancy.
TEST(StreamingUtf8ValidatorTest,HelloWorld)287 TEST(StreamingUtf8ValidatorTest, HelloWorld) {
288   static const char kHelloWorld[] = "Hello, World!";
289   EXPECT_EQ(
290       VALID_ENDPOINT,
291       StreamingUtf8Validator().AddBytes(kHelloWorld, strlen(kHelloWorld)));
292 }
293 
294 // Check that the Reset() method works.
TEST(StreamingUtf8ValidatorTest,ResetWorks)295 TEST(StreamingUtf8ValidatorTest, ResetWorks) {
296   StreamingUtf8Validator validator;
297   EXPECT_EQ(INVALID, validator.AddBytes("\xC0", 1));
298   EXPECT_EQ(INVALID, validator.AddBytes("a", 1));
299   validator.Reset();
300   EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes("a", 1));
301 }
302 
// Whole-sequence checks: each sequence goes to a fresh validator in a single
// call.

TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) {
  CheckRange(valid, valid_end, VALID_ENDPOINT);
}

TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) {
  CheckRange(PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
}

TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) {
  CheckRange(invalid, invalid_end, INVALID);
}

// The same three checks again, with the input delivered one byte at a time.

TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) {
  CheckRangeByteAtATime(valid, valid_end, VALID_ENDPOINT);
}

TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) {
  CheckRangeByteAtATime(PartialIterator(),
                        PartialIterator::end(),
                        VALID_MIDPOINT);
}

TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) {
  CheckRangeByteAtATime(invalid, invalid_end, INVALID);
}
327 
// Pairings that start with a complete valid sequence: the outcome is decided
// by the second sequence.

TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) {
  CheckCombinations(valid, valid_end, valid, valid_end, VALID_ENDPOINT);
}

TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) {
  CheckCombinations(
      valid, valid_end, PartialIterator(), PartialIterator::end(),
      VALID_MIDPOINT);
}

// Pairings that start with a partial sequence: neither a complete sequence
// nor another partial one may follow it.

TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) {
  CheckCombinations(
      PartialIterator(), PartialIterator::end(), valid, valid_end, INVALID);
}

TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) {
  CheckCombinations(
      PartialIterator(), PartialIterator::end(),
      PartialIterator(), PartialIterator::end(),
      INVALID);
}

// Pairings that involve an invalid sequence on either side are always
// invalid.

TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) {
  CheckCombinations(valid, valid_end, invalid, invalid_end, INVALID);
}

TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) {
  CheckCombinations(invalid, invalid_end, valid, valid_end, INVALID);
}

TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) {
  CheckCombinations(invalid, invalid_end, invalid, invalid_end, INVALID);
}

TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) {
  CheckCombinations(
      invalid, invalid_end, PartialIterator(), PartialIterator::end(),
      INVALID);
}

TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) {
  CheckCombinations(
      PartialIterator(), PartialIterator::end(), invalid, invalid_end,
      INVALID);
}
374 
// Tests for the one-shot StreamingUtf8Validator::Validate() helper.

// An empty string is accepted.
TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) {
  EXPECT_TRUE(StreamingUtf8Validator::Validate(std::string()));
}

// A complete two-byte sequence is accepted.
TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) {
  EXPECT_TRUE(StreamingUtf8Validator::Validate("\xc2\x81"));
}

// An overlong encoding of U+0000 is rejected.
TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) {
  EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc0\x80"));
}

// A lead byte whose continuation byte is missing is rejected.
TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) {
  EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc2"));
}
390 
391 }  // namespace
392 }  // namespace base
393