1// 2// Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com) 3// 4// Distributed under the Boost Software License, Version 1.0. (See accompanying 5// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 6// 7// Official repository: https://github.com/boostorg/beast 8// 9 10#ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP 11#define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP 12 13#include <boost/beast/websocket/detail/utf8_checker.hpp> 14 15#include <boost/assert.hpp> 16 17namespace boost { 18namespace beast { 19namespace websocket { 20namespace detail { 21 22void 23utf8_checker:: 24reset() 25{ 26 need_ = 0; 27 p_ = cp_; 28} 29 30bool 31utf8_checker:: 32finish() 33{ 34 auto const success = need_ == 0; 35 reset(); 36 return success; 37} 38 39bool 40utf8_checker:: 41write(std::uint8_t const* in, std::size_t size) 42{ 43 auto const valid = 44 [](std::uint8_t const*& p) 45 { 46 if(p[0] < 128) 47 { 48 ++p; 49 return true; 50 } 51 if((p[0] & 0xe0) == 0xc0) 52 { 53 if( (p[1] & 0xc0) != 0x80 || 54 (p[0] & 0x1e) == 0) // overlong 55 return false; 56 p += 2; 57 return true; 58 } 59 if((p[0] & 0xf0) == 0xe0) 60 { 61 if( (p[1] & 0xc0) != 0x80 62 || (p[2] & 0xc0) != 0x80 63 || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong 64 || (p[0] == 0xed && (p[1] & 0x20) == 0x20) // surrogate 65 //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF 66 ) 67 return false; 68 p += 3; 69 return true; 70 } 71 if((p[0] & 0xf8) == 0xf0) 72 { 73 if( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters 74 || (p[1] & 0xc0) != 0x80 75 || (p[2] & 0xc0) != 0x80 76 || (p[3] & 0xc0) != 0x80 77 || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong 78 || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4 // > U+10FFFF 79 ) 80 return false; 81 p += 4; 82 return true; 83 } 84 return false; 85 }; 86 auto const fail_fast = 87 [&]() 88 { 89 if(cp_[0] < 128) 90 { 91 return false; 92 } 93 94 const auto& p = cp_; // alias, only to keep this code similar to valid() above 95 const auto known_only = p_ - cp_; 96 if (known_only == 1) 97 { 98 if((p[0] & 0xe0) == 0xc0) 99 { 100 return ((p[0] & 0x1e) == 0); // overlong 101 } 102 if((p[0] & 0xf0) == 0xe0) 103 { 104 return false; 105 } 106 if((p[0] & 0xf8) == 0xf0) 107 { 108 return ((p[0] & 0x07) >= 0x05); // invalid F5...FF characters 109 } 110 } 111 else if (known_only == 2) 112 { 113 if((p[0] & 0xe0) == 0xc0) 114 { 115 return ((p[1] & 0xc0) != 0x80 || 116 (p[0] & 0x1e) == 0); // overlong 117 } 118 if((p[0] & 0xf0) == 0xe0) 119 { 120 return ( (p[1] & 0xc0) != 0x80 121 || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong 122 || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate 123 } 124 if((p[0] & 0xf8) == 0xf0) 125 { 126 return ( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters 127 || (p[1] & 0xc0) != 0x80 128 || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong 129 || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF 130 } 131 } 132 else if (known_only == 3) 133 { 134 if((p[0] & 0xe0) == 0xc0) 135 { 136 return ( (p[1] & 0xc0) != 0x80 137 || (p[0] & 0x1e) == 0); // overlong 138 } 139 if((p[0] & 0xf0) == 0xe0) 140 { 141 return ( (p[1] & 0xc0) != 0x80 142 || (p[2] & 0xc0) != 0x80 143 || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong 144 || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate 145 //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF 146 } 147 if((p[0] & 0xf8) == 0xf0) 148 { 149 return ( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters 150 || (p[1] & 0xc0) != 0x80 151 || (p[2] & 0xc0) != 0x80 152 || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong 153 || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF 154 } 155 } 156 return true; 157 }; 158 auto const needed = 159 [](std::uint8_t const v) 160 { 161 if(v < 128) 162 return 1; 163 if(v < 192) 164 return 0; 165 if(v < 224) 166 return 2; 167 if(v < 240) 168 return 3; 169 if(v < 248) 170 return 4; 171 return 0; 172 }; 173 174 auto const end = in + size; 175 176 // Finish up any incomplete code point 177 if(need_ > 0) 178 { 179 // Calculate what we have 180 auto n = (std::min)(size, need_); 181 size -= n; 182 need_ -= n; 183 184 // Add characters to the code point 185 while(n--) 186 *p_++ = *in++; 187 BOOST_ASSERT(p_ <= cp_ + 4); 188 189 // Still incomplete? 190 if(need_ > 0) 191 { 192 // Incomplete code point 193 BOOST_ASSERT(in == end); 194 195 // Do partial validation on the incomplete 196 // code point, this is called "Fail fast" 197 // in Autobahn|Testsuite parlance. 198 return ! fail_fast(); 199 } 200 201 // Complete code point, validate it 202 std::uint8_t const* p = &cp_[0]; 203 if(! valid(p)) 204 return false; 205 p_ = cp_; 206 } 207 208 if(size <= sizeof(std::size_t)) 209 goto slow; 210 211 // Align `in` to sizeof(std::size_t) boundary 212 { 213 auto const in0 = in; 214 auto last = reinterpret_cast<std::uint8_t const*>( 215 ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) / 216 sizeof(std::size_t)) * sizeof(std::size_t)); 217 218 // Check one character at a time for low-ASCII 219 while(in < last) 220 { 221 if(*in & 0x80) 222 { 223 // Not low-ASCII so switch to slow loop 224 size = size - (in - in0); 225 goto slow; 226 } 227 ++in; 228 } 229 size = size - (in - in0); 230 } 231 232 // Fast loop: Process 4 or 8 low-ASCII characters at a time 233 { 234 auto const in0 = in; 235 auto last = in + size - 7; 236 auto constexpr mask = static_cast< 237 std::size_t>(0x8080808080808080 & ~std::size_t{0}); 238 while(in < last) 239 { 240#if 0 241 std::size_t temp; 242 std::memcpy(&temp, in, sizeof(temp)); 243 if((temp & mask) != 0) 244#else 245 // Technically UB but works on all known platforms 246 if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0) 247#endif 248 { 249 size = size - (in - in0); 250 goto slow; 251 } 252 in += sizeof(std::size_t); 253 } 254 // There's at least one more full code point left 255 last += 4; 256 while(in < last) 257 if(! valid(in)) 258 return false; 259 goto tail; 260 } 261 262slow: 263 // Slow loop: Full validation on one code point at a time 264 { 265 auto last = in + size - 3; 266 while(in < last) 267 if(! valid(in)) 268 return false; 269 } 270 271tail: 272 // Handle the remaining bytes. The last 273 // characters could split a code point so 274 // we save the partial code point for later. 275 // 276 // On entry to the loop, `in` points to the 277 // beginning of a code point. 278 // 279 for(;;) 280 { 281 // Number of chars left 282 auto n = end - in; 283 if(! n) 284 break; 285 286 // Chars we need to finish this code point 287 auto const need = needed(*in); 288 if(need == 0) 289 return false; 290 if(need <= n) 291 { 292 // Check a whole code point 293 if(! valid(in)) 294 return false; 295 } 296 else 297 { 298 // Calculate how many chars we need 299 // to finish this partial code point 300 need_ = need - n; 301 302 // Save the partial code point 303 while(n--) 304 *p_++ = *in++; 305 BOOST_ASSERT(in == end); 306 BOOST_ASSERT(p_ <= cp_ + 4); 307 308 // Do partial validation on the incomplete 309 // code point, this is called "Fail fast" 310 // in Autobahn|Testsuite parlance. 311 return ! fail_fast(); 312 } 313 } 314 return true; 315} 316 317bool 318check_utf8(char const* p, std::size_t n) 319{ 320 utf8_checker c; 321 if(! c.write(reinterpret_cast<const uint8_t*>(p), n)) 322 return false; 323 return c.finish(); 324} 325 326} // detail 327} // websocket 328} // beast 329} // boost 330 331#endif // BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP 332