1 // Copyright 2014 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // This implementation doesn't use ICU. The ICU macros are oriented towards 6 // character-at-a-time processing, whereas byte-at-a-time processing is easier 7 // with streaming input. 8 9 #include "base/i18n/streaming_utf8_validator.h" 10 11 #include "base/i18n/utf8_validator_tables.h" 12 #include "base/logging.h" 13 14 namespace base { 15 namespace { 16 StateTableLookup(uint8 offset)17uint8 StateTableLookup(uint8 offset) { 18 DCHECK_LT(offset, internal::kUtf8ValidatorTablesSize); 19 return internal::kUtf8ValidatorTables[offset]; 20 } 21 22 } // namespace 23 AddBytes(const char * data,size_t size)24StreamingUtf8Validator::State StreamingUtf8Validator::AddBytes(const char* data, 25 size_t size) { 26 // Copy |state_| into a local variable so that the compiler doesn't have to be 27 // careful of aliasing. 28 uint8 state = state_; 29 for (const char* p = data; p != data + size; ++p) { 30 if ((*p & 0x80) == 0) { 31 if (state == 0) 32 continue; 33 state = internal::I18N_UTF8_VALIDATOR_INVALID_INDEX; 34 break; 35 } 36 const uint8 shift_amount = StateTableLookup(state); 37 const uint8 shifted_char = (*p & 0x7F) >> shift_amount; 38 state = StateTableLookup(state + shifted_char + 1); 39 // State may be INVALID here, but this code is optimised for the case of 40 // valid UTF-8 and it is more efficient (by about 2%) to not attempt an 41 // early loop exit unless we hit an ASCII character. 42 } 43 state_ = state; 44 return state == 0 ? VALID_ENDPOINT 45 : state == internal::I18N_UTF8_VALIDATOR_INVALID_INDEX 46 ? INVALID 47 : VALID_MIDPOINT; 48 } 49 Reset()50void StreamingUtf8Validator::Reset() { 51 state_ = 0u; 52 } 53 Validate(const std::string & string)54bool StreamingUtf8Validator::Validate(const std::string& string) { 55 return StreamingUtf8Validator().AddBytes(string.data(), string.size()) == 56 VALID_ENDPOINT; 57 } 58 59 } // namespace base 60