//
// Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Official repository: https://github.com/boostorg/beast
//
9
10#ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP
11#define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP
12
#include <boost/beast/websocket/detail/utf8_checker.hpp>

#include <boost/assert.hpp>

#include <cstring>
16
17namespace boost {
18namespace beast {
19namespace websocket {
20namespace detail {
21
22void
23utf8_checker::
24reset()
25{
26    need_ = 0;
27    p_ = cp_;
28}
29
30bool
31utf8_checker::
32finish()
33{
34    auto const success = need_ == 0;
35    reset();
36    return success;
37}
38
39bool
40utf8_checker::
41write(std::uint8_t const* in, std::size_t size)
42{
43    auto const valid =
44        [](std::uint8_t const*& p)
45        {
46            if(p[0] < 128)
47            {
48                ++p;
49                return true;
50            }
51            if((p[0] & 0xe0) == 0xc0)
52            {
53                if( (p[1] & 0xc0) != 0x80 ||
54                    (p[0] & 0x1e) == 0)  // overlong
55                    return false;
56                p += 2;
57                return true;
58            }
59            if((p[0] & 0xf0) == 0xe0)
60            {
61                if(    (p[1] & 0xc0) != 0x80
62                    || (p[2] & 0xc0) != 0x80
63                    || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong
64                    || (p[0] == 0xed && (p[1] & 0x20) == 0x20) // surrogate
65                    //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
66                    )
67                    return false;
68                p += 3;
69                return true;
70            }
71            if((p[0] & 0xf8) == 0xf0)
72            {
73                if(    (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
74                    || (p[1] & 0xc0) != 0x80
75                    || (p[2] & 0xc0) != 0x80
76                    || (p[3] & 0xc0) != 0x80
77                    || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong
78                    || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4 // > U+10FFFF
79                    )
80                    return false;
81                p += 4;
82                return true;
83            }
84            return false;
85        };
86    auto const fail_fast =
87        [&]()
88        {
89            if(cp_[0] < 128)
90            {
91                return false;
92            }
93
94            const auto& p = cp_; // alias, only to keep this code similar to valid() above
95            const auto known_only = p_ - cp_;
96            if (known_only == 1)
97            {
98                if((p[0] & 0xe0) == 0xc0)
99                {
100                    return ((p[0] & 0x1e) == 0);  // overlong
101                }
102                if((p[0] & 0xf0) == 0xe0)
103                {
104                    return false;
105                }
106                if((p[0] & 0xf8) == 0xf0)
107                {
108                    return ((p[0] & 0x07) >= 0x05);  // invalid F5...FF characters
109                }
110            }
111            else if (known_only == 2)
112            {
113                if((p[0] & 0xe0) == 0xc0)
114                {
115                    return ((p[1] & 0xc0) != 0x80 ||
116                            (p[0] & 0x1e) == 0);  // overlong
117                }
118                if((p[0] & 0xf0) == 0xe0)
119                {
120                    return (  (p[1] & 0xc0) != 0x80
121                           || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong
122                           || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate
123                }
124                if((p[0] & 0xf8) == 0xf0)
125                {
126                    return (  (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
127                           || (p[1] & 0xc0) != 0x80
128                           || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong
129                           || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF
130                }
131            }
132            else if (known_only == 3)
133            {
134                if((p[0] & 0xe0) == 0xc0)
135                {
136                    return (  (p[1] & 0xc0) != 0x80
137                           || (p[0] & 0x1e) == 0);  // overlong
138                }
139                if((p[0] & 0xf0) == 0xe0)
140                {
141                    return (  (p[1] & 0xc0) != 0x80
142                           || (p[2] & 0xc0) != 0x80
143                           || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong
144                           || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate
145                           //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
146                }
147                if((p[0] & 0xf8) == 0xf0)
148                {
149                    return (  (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
150                           || (p[1] & 0xc0) != 0x80
151                           || (p[2] & 0xc0) != 0x80
152                           || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong
153                           || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF
154                }
155            }
156            return true;
157        };
158    auto const needed =
159        [](std::uint8_t const v)
160        {
161            if(v < 128)
162                return 1;
163            if(v < 192)
164                return 0;
165            if(v < 224)
166                return 2;
167            if(v < 240)
168                return 3;
169            if(v < 248)
170                return 4;
171            return 0;
172        };
173
174    auto const end = in + size;
175
176    // Finish up any incomplete code point
177    if(need_ > 0)
178    {
179        // Calculate what we have
180        auto n = (std::min)(size, need_);
181        size -= n;
182        need_ -= n;
183
184        // Add characters to the code point
185        while(n--)
186            *p_++ = *in++;
187        BOOST_ASSERT(p_ <= cp_ + 4);
188
189        // Still incomplete?
190        if(need_ > 0)
191        {
192            // Incomplete code point
193            BOOST_ASSERT(in == end);
194
195            // Do partial validation on the incomplete
196            // code point, this is called "Fail fast"
197            // in Autobahn|Testsuite parlance.
198            return ! fail_fast();
199        }
200
201        // Complete code point, validate it
202        std::uint8_t const* p = &cp_[0];
203        if(! valid(p))
204            return false;
205        p_ = cp_;
206    }
207
208    if(size <= sizeof(std::size_t))
209        goto slow;
210
211    // Align `in` to sizeof(std::size_t) boundary
212    {
213        auto const in0 = in;
214        auto last = reinterpret_cast<std::uint8_t const*>(
215            ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
216                sizeof(std::size_t)) * sizeof(std::size_t));
217
218        // Check one character at a time for low-ASCII
219        while(in < last)
220        {
221            if(*in & 0x80)
222            {
223                // Not low-ASCII so switch to slow loop
224                size = size - (in - in0);
225                goto slow;
226            }
227            ++in;
228        }
229        size = size - (in - in0);
230    }
231
232    // Fast loop: Process 4 or 8 low-ASCII characters at a time
233    {
234        auto const in0 = in;
235        auto last = in + size - 7;
236        auto constexpr mask = static_cast<
237            std::size_t>(0x8080808080808080 & ~std::size_t{0});
238        while(in < last)
239        {
240#if 0
241            std::size_t temp;
242            std::memcpy(&temp, in, sizeof(temp));
243            if((temp & mask) != 0)
244#else
245            // Technically UB but works on all known platforms
246            if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0)
247#endif
248            {
249                size = size - (in - in0);
250                goto slow;
251            }
252            in += sizeof(std::size_t);
253        }
254        // There's at least one more full code point left
255        last += 4;
256        while(in < last)
257            if(! valid(in))
258                return false;
259        goto tail;
260    }
261
262slow:
263    // Slow loop: Full validation on one code point at a time
264    {
265        auto last = in + size - 3;
266        while(in < last)
267            if(! valid(in))
268                return false;
269    }
270
271tail:
272    // Handle the remaining bytes. The last
273    // characters could split a code point so
274    // we save the partial code point for later.
275    //
276    // On entry to the loop, `in` points to the
277    // beginning of a code point.
278    //
279    for(;;)
280    {
281        // Number of chars left
282        auto n = end - in;
283        if(! n)
284            break;
285
286        // Chars we need to finish this code point
287        auto const need = needed(*in);
288        if(need == 0)
289            return false;
290        if(need <= n)
291        {
292            // Check a whole code point
293            if(! valid(in))
294                return false;
295        }
296        else
297        {
298            // Calculate how many chars we need
299            // to finish this partial code point
300            need_ = need - n;
301
302            // Save the partial code point
303            while(n--)
304                *p_++ = *in++;
305            BOOST_ASSERT(in == end);
306            BOOST_ASSERT(p_ <= cp_ + 4);
307
308            // Do partial validation on the incomplete
309            // code point, this is called "Fail fast"
310            // in Autobahn|Testsuite parlance.
311            return ! fail_fast();
312        }
313    }
314    return true;
315}
316
317bool
318check_utf8(char const* p, std::size_t n)
319{
320    utf8_checker c;
321    if(! c.write(reinterpret_cast<const uint8_t*>(p), n))
322        return false;
323    return c.finish();
324}
325
326} // detail
327} // websocket
328} // beast
329} // boost
330
331#endif // BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP
332