• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/i18n/streaming_utf8_validator.h"
6 
7 #include <stdio.h>
8 #include <string.h>
9 
10 #include <string>
11 
12 #include "base/strings/string_piece.h"
13 #include "testing/gtest/include/gtest/gtest.h"
14 
15 // Define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST to verify that this class
16 // accepts exactly the same set of 4-byte strings as ICU-based validation. This
17 // tests every possible 4-byte string, so it is too slow to run routinely on
18 // low-powered machines.
19 //
20 // #define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
21 
22 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
23 
24 #include "base/basictypes.h"
25 #include "base/bind.h"
26 #include "base/location.h"
27 #include "base/logging.h"
28 #include "base/memory/ref_counted.h"
29 #include "base/strings/string_util.h"
30 #include "base/strings/stringprintf.h"
31 #include "base/strings/utf_string_conversion_utils.h"
32 #include "base/synchronization/condition_variable.h"
33 #include "base/synchronization/lock.h"
34 #include "base/threading/sequenced_worker_pool.h"
35 #include "third_party/icu/source/common/unicode/utf8.h"
36 
37 #endif  // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
38 
39 namespace base {
40 namespace {
41 
42 // Avoid having to qualify the enum values in the tests.
43 const StreamingUtf8Validator::State VALID_ENDPOINT =
44     StreamingUtf8Validator::VALID_ENDPOINT;
45 const StreamingUtf8Validator::State VALID_MIDPOINT =
46     StreamingUtf8Validator::VALID_MIDPOINT;
47 const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID;
48 
49 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
50 
51 const uint32 kThoroughTestChunkSize = 1 << 24;
52 
53 class StreamingUtf8ValidatorThoroughTest : public ::testing::Test {
54  protected:
StreamingUtf8ValidatorThoroughTest()55   StreamingUtf8ValidatorThoroughTest()
56       : all_done_(&lock_), tasks_dispatched_(0), tasks_finished_(0) {}
57 
58   // This uses the same logic as base::IsStringUTF8 except it considers
59   // non-characters valid (and doesn't require a string as input).
IsStringUtf8(const char * src,int32 src_len)60   static bool IsStringUtf8(const char* src, int32 src_len) {
61     int32 char_index = 0;
62 
63     while (char_index < src_len) {
64       int32 code_point;
65       U8_NEXT(src, char_index, src_len, code_point);
66       if (!base::IsValidCodepoint(code_point))
67         return false;
68     }
69     return true;
70   }
71 
72   // Converts the passed-in integer to a 4 byte string and then
73   // verifies that IsStringUtf8 and StreamingUtf8Validator agree on
74   // whether it is valid UTF-8 or not.
TestNumber(uint32 n) const75   void TestNumber(uint32 n) const {
76     char test[sizeof n];
77     memcpy(test, &n, sizeof n);
78     StreamingUtf8Validator validator;
79     EXPECT_EQ(IsStringUtf8(test, sizeof n),
80               validator.AddBytes(test, sizeof n) == VALID_ENDPOINT)
81         << "Difference of opinion for \""
82         << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X",
83                               test[0] & 0xFF,
84                               test[1] & 0xFF,
85                               test[2] & 0xFF,
86                               test[3] & 0xFF) << "\"";
87   }
88 
89  public:
90   // Tests the 4-byte sequences corresponding to the |size| integers
91   // starting at |begin|. This is intended to be run from a worker
92   // pool. Signals |all_done_| at the end if it thinks all tasks are
93   // finished.
TestRange(uint32 begin,uint32 size)94   void TestRange(uint32 begin, uint32 size) {
95     for (uint32 i = 0; i < size; ++i) {
96       TestNumber(begin + i);
97     }
98     base::AutoLock al(lock_);
99     ++tasks_finished_;
100     LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_
101               << " tasks done\n";
102     if (tasks_finished_ >= tasks_dispatched_) {
103       all_done_.Signal();
104     }
105   }
106 
107  protected:
108   base::Lock lock_;
109   base::ConditionVariable all_done_;
110   int tasks_dispatched_;
111   int tasks_finished_;
112 };
113 
// Splits the whole 32-bit range into chunks of kThoroughTestChunkSize, posts
// one TestRange task per chunk to a worker pool and waits for all of them.
TEST_F(StreamingUtf8ValidatorThoroughTest, TestEverything) {
  scoped_refptr<base::SequencedWorkerPool> pool(
      new base::SequencedWorkerPool(32, "TestEverything"));
  // |lock_| is held while dispatching, so the counters read by TestRange
  // cannot change underneath it until this thread waits on |all_done_|.
  base::AutoLock al(lock_);
  uint32 begin = 0;
  for (;;) {
    pool->PostWorkerTask(
        FROM_HERE,
        base::Bind(&StreamingUtf8ValidatorThoroughTest::TestRange,
                   base::Unretained(this),
                   begin,
                   kThoroughTestChunkSize));
    ++tasks_dispatched_;
    begin += kThoroughTestChunkSize;
    // |begin| wraps round to zero once the last chunk has been posted.
    if (begin == 0)
      break;
  }
  while (tasks_finished_ < tasks_dispatched_)
    all_done_.Wait();
}
132 
133 #endif  // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
134 
135 // These valid and invalid UTF-8 sequences are based on the tests from
136 // base/strings/string_util_unittest.cc
137 
138 // All of the strings in |valid| must represent a single codepoint, because
139 // partial sequences are constructed by taking non-empty prefixes of these
140 // strings.
const char* const valid[] = {
    "\r",
    "\n",
    "a",
    "\xc2\x81",          // 2-byte sequence
    "\xe1\x80\xbf",      // 3-byte sequence
    "\xf1\x80\xa0\xbf",  // 4-byte sequence
    "\xef\xbb\xbf",      // UTF-8 BOM
};
145 
146 const char* const* const valid_end = valid + arraysize(valid);
147 
// Byte sequences that are not valid UTF-8. The tests below expect the
// validator to report INVALID for each entry on its own, so every entry must
// be a separate array element: a missing comma silently concatenates two
// adjacent literals into one.
const char* const invalid[] = {
    // always invalid bytes
    "\xc0", "\xc1",
    "\xf5", "\xf6", "\xf7",
    "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
    // surrogate code points
    "\xed\xa0\x80",  // U+D800
    "\xed\xa0\x8f",  // U+D80F
    "\xed\xbf\xbf",  // U+DFFF
    //
    // overlong sequences
    "\xc0\x80",              // U+0000
    "\xc1\x80",              // U+0040 "@"
    "\xc1\x81",              // U+0041 "A"
    "\xe0\x80\x80",          // U+0000
    "\xe0\x82\x80",          // U+0080
    "\xe0\x9f\xbf",          // U+07ff
    "\xf0\x80\x80\x8D",      // U+000D
    "\xf0\x80\x82\x91",      // U+0091
    "\xf0\x80\xa0\x80",      // U+0800
    "\xf0\x8f\xbb\xbf",      // U+FEFF (BOM)
    "\xf8\x80\x80\x80\xbf",  // U+003F
    "\xfc\x80\x80\x80\xa0\xa5",
    //
    // Beyond U+10FFFF
    "\xf4\x90\x80\x80",          // U+110000
    "\xf8\xa0\xbf\x80\xbf",      // 5 bytes
    "\xfc\x9c\xbf\x80\xbf\x80",  // 6 bytes
    //
    // BOMs in UTF-16(BE|LE)
    "\xfe\xff", "\xff\xfe",
};
178 
179 const char* const* const invalid_end = invalid + arraysize(invalid);
180 
181 // A ForwardIterator which returns all the non-empty prefixes of the elements of
182 // "valid".
183 class PartialIterator {
184  public:
185   // The constructor returns the first iterator, ie. it is equivalent to
186   // begin().
PartialIterator()187   PartialIterator() : index_(0), prefix_length_(0) { Advance(); }
188   // The trivial destructor left intentionally undefined.
189   // This is a value type; the default copy constructor and assignment operator
190   // generated by the compiler are used.
191 
end()192   static PartialIterator end() { return PartialIterator(arraysize(valid), 1); }
193 
operator ++()194   PartialIterator& operator++() {
195     Advance();
196     return *this;
197   }
198 
operator *() const199   base::StringPiece operator*() const {
200     return base::StringPiece(valid[index_], prefix_length_);
201   }
202 
operator ==(const PartialIterator & rhs) const203   bool operator==(const PartialIterator& rhs) const {
204     return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_;
205   }
206 
operator !=(const PartialIterator & rhs) const207   bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); }
208 
209  private:
210   // This constructor is used by the end() method.
PartialIterator(size_t index,size_t prefix_length)211   PartialIterator(size_t index, size_t prefix_length)
212       : index_(index), prefix_length_(prefix_length) {}
213 
Advance()214   void Advance() {
215     if (index_ < arraysize(valid) && prefix_length_ < strlen(valid[index_]))
216       ++prefix_length_;
217     while (index_ < arraysize(valid) &&
218            prefix_length_ == strlen(valid[index_])) {
219       ++index_;
220       prefix_length_ = 1;
221     }
222   }
223 
224   // The UTF-8 sequence, as an offset into the |valid| array.
225   size_t index_;
226   size_t prefix_length_;
227 };
228 
229 // A test fixture for tests which test one UTF-8 sequence (or invalid
230 // byte sequence) at a time.
231 class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test {
232  protected:
233   // Iterator must be convertible when de-referenced to StringPiece.
234   template <typename Iterator>
CheckRange(Iterator begin,Iterator end,StreamingUtf8Validator::State expected)235   void CheckRange(Iterator begin,
236                   Iterator end,
237                   StreamingUtf8Validator::State expected) {
238     for (Iterator it = begin; it != end; ++it) {
239       StreamingUtf8Validator validator;
240       base::StringPiece sequence = *it;
241       EXPECT_EQ(expected,
242                 validator.AddBytes(sequence.data(), sequence.size()))
243           << "Failed for \"" << sequence << "\"";
244     }
245   }
246 
247   // Adding input a byte at a time should make absolutely no difference.
248   template <typename Iterator>
CheckRangeByteAtATime(Iterator begin,Iterator end,StreamingUtf8Validator::State expected)249   void CheckRangeByteAtATime(Iterator begin,
250                              Iterator end,
251                              StreamingUtf8Validator::State expected) {
252     for (Iterator it = begin; it != end; ++it) {
253       StreamingUtf8Validator validator;
254       base::StringPiece sequence = *it;
255       StreamingUtf8Validator::State state = VALID_ENDPOINT;
256       for (base::StringPiece::const_iterator cit = sequence.begin();
257            cit != sequence.end();
258            ++cit) {
259         state = validator.AddBytes(&*cit, 1);
260       }
261       EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\"";
262     }
263   }
264 };
265 
266 // A test fixture for tests which test the concatenation of byte sequences.
267 class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test {
268  protected:
269   // Check every possible concatenation of byte sequences from two
270   // ranges, and verify that the combination matches the expected
271   // state.
272   template <typename Iterator1, typename Iterator2>
CheckCombinations(Iterator1 begin1,Iterator1 end1,Iterator2 begin2,Iterator2 end2,StreamingUtf8Validator::State expected)273   void CheckCombinations(Iterator1 begin1,
274                          Iterator1 end1,
275                          Iterator2 begin2,
276                          Iterator2 end2,
277                          StreamingUtf8Validator::State expected) {
278     StreamingUtf8Validator validator;
279     for (Iterator1 it1 = begin1; it1 != end1; ++it1) {
280       base::StringPiece c1 = *it1;
281       for (Iterator2 it2 = begin2; it2 != end2; ++it2) {
282         base::StringPiece c2 = *it2;
283         validator.AddBytes(c1.data(), c1.size());
284         EXPECT_EQ(expected, validator.AddBytes(c2.data(), c2.size()))
285             << "Failed for \"" << c1 << c2 << "\"";
286         validator.Reset();
287       }
288     }
289   }
290 };
291 
// Adding zero bytes to a fresh validator leaves it at a valid endpoint.
TEST(StreamingUtf8ValidatorTest, NothingIsValid) {
  static const char kNothing[] = "";
  EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNothing, 0));
}
296 
297 // Because the members of the |valid| array need to be non-zero length
298 // sequences and are measured with strlen(), |valid| cannot be used it
299 // to test the NUL character '\0', so the NUL character gets its own
300 // test.
TEST(StreamingUtf8ValidatorTest,NulIsValid)301 TEST(StreamingUtf8ValidatorTest, NulIsValid) {
302   static const char kNul[] = "\x00";
303   EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNul, 1));
304 }
305 
306 // Just a basic sanity test before we start getting fancy.
TEST(StreamingUtf8ValidatorTest,HelloWorld)307 TEST(StreamingUtf8ValidatorTest, HelloWorld) {
308   static const char kHelloWorld[] = "Hello, World!";
309   EXPECT_EQ(
310       VALID_ENDPOINT,
311       StreamingUtf8Validator().AddBytes(kHelloWorld, strlen(kHelloWorld)));
312 }
313 
314 // Check that the Reset() method works.
TEST(StreamingUtf8ValidatorTest,ResetWorks)315 TEST(StreamingUtf8ValidatorTest, ResetWorks) {
316   StreamingUtf8Validator validator;
317   EXPECT_EQ(INVALID, validator.AddBytes("\xC0", 1));
318   EXPECT_EQ(INVALID, validator.AddBytes("a", 1));
319   validator.Reset();
320   EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes("a", 1));
321 }
322 
// Every complete sequence in |valid| ends at a valid endpoint.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) {
  const StreamingUtf8Validator::State expected = VALID_ENDPOINT;
  CheckRange(valid, valid_end, expected);
}
326 
// Every truncated prefix of a |valid| sequence leaves the validator mid-way
// through a character.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) {
  const PartialIterator first;
  const PartialIterator last = PartialIterator::end();
  CheckRange(first, last, VALID_MIDPOINT);
}
330 
// Every entry of |invalid| is rejected.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) {
  const StreamingUtf8Validator::State expected = INVALID;
  CheckRange(invalid, invalid_end, expected);
}
334 
// As the Valid test, but feeding each sequence one byte at a time.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) {
  const StreamingUtf8Validator::State expected = VALID_ENDPOINT;
  CheckRangeByteAtATime(valid, valid_end, expected);
}
338 
// As the Partial test, but feeding each prefix one byte at a time.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) {
  const PartialIterator first;
  const PartialIterator last = PartialIterator::end();
  CheckRangeByteAtATime(first, last, VALID_MIDPOINT);
}
343 
// As the Invalid test, but feeding each entry one byte at a time.
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) {
  const StreamingUtf8Validator::State expected = INVALID;
  CheckRangeByteAtATime(invalid, invalid_end, expected);
}
347 
// Two complete sequences back to back end at a valid endpoint.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) {
  const StreamingUtf8Validator::State expected = VALID_ENDPOINT;
  CheckCombinations(valid, valid_end, valid, valid_end, expected);
}
351 
// A complete sequence followed by a truncated one ends mid-character.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) {
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(
      valid, valid_end, partial_begin, partial_end, VALID_MIDPOINT);
}
359 
// A truncated sequence followed by the start of a new character is invalid.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) {
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(partial_begin, partial_end, valid, valid_end, INVALID);
}
364 
// A truncated sequence followed by another truncated sequence is invalid.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) {
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(
      partial_begin, partial_end, partial_begin, partial_end, INVALID);
}
372 
// Invalid bytes after a complete sequence make the stream invalid.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) {
  const StreamingUtf8Validator::State expected = INVALID;
  CheckCombinations(valid, valid_end, invalid, invalid_end, expected);
}
376 
// Once invalid, the stream stays invalid even if valid bytes follow.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) {
  const StreamingUtf8Validator::State expected = INVALID;
  CheckCombinations(invalid, invalid_end, valid, valid_end, expected);
}
380 
// Two invalid sequences in a row are, unsurprisingly, invalid.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) {
  const StreamingUtf8Validator::State expected = INVALID;
  CheckCombinations(invalid, invalid_end, invalid, invalid_end, expected);
}
384 
// Once invalid, the stream stays invalid when a truncated sequence follows.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) {
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(invalid, invalid_end, partial_begin, partial_end, INVALID);
}
389 
// Invalid bytes after a truncated sequence make the stream invalid.
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) {
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(partial_begin, partial_end, invalid, invalid_end, INVALID);
}
394 
// The one-shot Validate() helper accepts the empty string.
TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) {
  EXPECT_TRUE(StreamingUtf8Validator::Validate(std::string()));
}
398 
// Validate() accepts a complete two-byte sequence.
TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) {
  EXPECT_TRUE(StreamingUtf8Validator::Validate("\xc2\x81"));
}
402 
// Validate() rejects the overlong two-byte encoding of U+0000.
TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) {
  EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc0\x80"));
}
406 
// Validate() rejects input that stops after a lead byte, with no
// continuation byte.
TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) {
  EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc2"));
}
410 
411 }  // namespace
412 }  // namespace base
413