• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *
3  * Copyright (c) 2004
4  * John Maddock
5  *
6  * Use, modification and distribution are subject to the
7  * Boost Software License, Version 1.0. (See accompanying file
8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9  *
10  */
11 
12  /*
13   *   LOCATION:    see http://www.boost.org for most recent version.
14   *   FILE         unicode_iterator.hpp
15   *   VERSION      see <boost/version.hpp>
16   *   DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
17   */
18 
19 /****************************************************************************
20 
21 Contents:
22 ~~~~~~~~~
23 
24 1) Read Only, Input Adapters:
25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 
27 template <class BaseIterator, class U8Type = std::uint8_t>
28 class u32_to_u8_iterator;
29 
30 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
31 
32 template <class BaseIterator, class U32Type = std::uint32_t>
33 class u8_to_u32_iterator;
34 
Adapts a sequence of UTF-8 code units to "look like" a sequence of UTF-32 code points.
36 
37 template <class BaseIterator, class U16Type = std::uint16_t>
38 class u32_to_u16_iterator;
39 
40 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
41 
42 template <class BaseIterator, class U32Type = std::uint32_t>
43 class u16_to_u32_iterator;
44 
Adapts a sequence of UTF-16 code units to "look like" a sequence of UTF-32 code points.
46 
47 2) Single pass output iterator adapters:
48 
49 template <class BaseIterator>
50 class utf8_output_iterator;
51 
Accepts UTF-32 code points and forwards them on as UTF-8 code units.
53 
54 template <class BaseIterator>
55 class utf16_output_iterator;
56 
Accepts UTF-32 code points and forwards them on as UTF-16 code units.
58 
59 ****************************************************************************/
60 
61 #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
62 #define BOOST_REGEX_UNICODE_ITERATOR_HPP
63 #include <cstdint>
64 #include <boost/regex/config.hpp>
65 #include <stdexcept>
66 #include <sstream>
67 #include <ios>
68 #include <limits.h> // CHAR_BIT
69 
70 #include <iostream>
71 
72 #ifndef BOOST_REGEX_STANDALONE
73 #include <boost/throw_exception.hpp>
74 #endif
75 
76 namespace boost{
77 
78 namespace detail{
79 
// 0xD800 - (0x10000 >> 10): adding (code_point >> 10) to this base yields the
// high surrogate directly for any code point at or above 0x10000.
static const std::uint16_t high_surrogate_base = 0xD800u - (0x10000u >> 10);
// Value added to the low ten bits of a code point to form the low surrogate.
static const std::uint16_t low_surrogate_base = 0xDC00u;
// Selects the ten payload bits carried by a surrogate.
static const std::uint32_t ten_bit_mask = (1u << 10) - 1u;
83 
// Returns true when v lies in the high (leading) surrogate range 0xD800..0xDBFF.
inline bool is_high_surrogate(std::uint16_t v)
{
   return (v >= 0xD800u) && (v <= 0xDBFFu);
}
// Returns true when v lies in the low (trailing) surrogate range 0xDC00..0xDFFF.
inline bool is_low_surrogate(std::uint16_t v)
{
   return (v >= 0xDC00u) && (v <= 0xDFFFu);
}
// Returns true when v is any surrogate value (0xD800..0xDFFF).
template <class T>
inline bool is_surrogate(T v)
{
   // Clearing the low 11 bits maps every surrogate onto 0xD800.
   const auto upper_bits = v & 0xFFFFF800u;
   return upper_bits == 0xd800;
}
97 
// Returns the length in bytes (1..4) of the UTF-8 sequence implied by lead
// byte c.  The length is the number of leading 1 bits in c; a byte with no
// leading 1 bits counts as 1, and counts above 4 are clamped to 4.
inline unsigned utf8_byte_count(std::uint8_t c)
{
   unsigned leading_ones = 0;
   for(unsigned bit = 0x80u; (bit != 0) && ((c & bit) != 0); bit >>= 1)
      ++leading_ones;
   if(leading_ones == 0)
      return 1;
   if(leading_ones > 4)
      return 4;
   return leading_ones;
}
111 
utf8_trailing_byte_count(std::uint8_t c)112 inline unsigned utf8_trailing_byte_count(std::uint8_t c)
113 {
114    return utf8_byte_count(c) - 1;
115 }
116 
117 #ifdef BOOST_REGEX_MSVC
118 #pragma warning(push)
119 #pragma warning(disable:4100)
120 #endif
121 #ifndef BOOST_NO_EXCEPTIONS
122 BOOST_REGEX_NORETURN
123 #endif
invalid_utf32_code_point(std::uint32_t val)124 inline void invalid_utf32_code_point(std::uint32_t val)
125 {
126    std::stringstream ss;
127    ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
128    std::out_of_range e(ss.str());
129 #ifndef BOOST_REGEX_STANDALONE
130    boost::throw_exception(e);
131 #else
132    throw e;
133 #endif
134 }
135 #ifdef BOOST_REGEX_MSVC
136 #pragma warning(pop)
137 #endif
138 
139 
140 } // namespace detail
141 
142 template <class BaseIterator, class U16Type = std::uint16_t>
143 class u32_to_u16_iterator
144 {
145    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
146 
147    static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
148    static_assert(sizeof(U16Type)*CHAR_BIT == 16, "Incorrectly sized template argument");
149 
150 public:
151    typedef std::ptrdiff_t     difference_type;
152    typedef U16Type            value_type;
153    typedef value_type const*  pointer;
154    typedef value_type const   reference;
155    typedef std::bidirectional_iterator_tag iterator_category;
156 
operator *() const157    reference operator*()const
158    {
159       if(m_current == 2)
160          extract_current();
161       return m_values[m_current];
162    }
operator ==(const u32_to_u16_iterator & that) const163    bool operator==(const u32_to_u16_iterator& that)const
164    {
165       if(m_position == that.m_position)
166       {
167          // Both m_currents must be equal, or both even
168          // this is the same as saying their sum must be even:
169          return (m_current + that.m_current) & 1u ? false : true;
170       }
171       return false;
172    }
operator !=(const u32_to_u16_iterator & that) const173    bool operator!=(const u32_to_u16_iterator& that)const
174    {
175       return !(*this == that);
176    }
operator ++()177    u32_to_u16_iterator& operator++()
178    {
179       // if we have a pending read then read now, so that we know whether
180       // to skip a position, or move to a low-surrogate:
181       if(m_current == 2)
182       {
183          // pending read:
184          extract_current();
185       }
186       // move to the next surrogate position:
187       ++m_current;
188       // if we've reached the end skip a position:
189       if(m_values[m_current] == 0)
190       {
191          m_current = 2;
192          ++m_position;
193       }
194       return *this;
195    }
operator ++(int)196    u32_to_u16_iterator operator++(int)
197    {
198       u32_to_u16_iterator r(*this);
199       ++(*this);
200       return r;
201    }
operator --()202    u32_to_u16_iterator& operator--()
203    {
204       if(m_current != 1)
205       {
206          // decrementing an iterator always leads to a valid position:
207          --m_position;
208          extract_current();
209          m_current = m_values[1] ? 1 : 0;
210       }
211       else
212       {
213          m_current = 0;
214       }
215       return *this;
216    }
operator --(int)217    u32_to_u16_iterator operator--(int)
218    {
219       u32_to_u16_iterator r(*this);
220       --(*this);
221       return r;
222    }
base() const223    BaseIterator base()const
224    {
225       return m_position;
226    }
227    // construct:
u32_to_u16_iterator()228    u32_to_u16_iterator() : m_position(), m_current(0)
229    {
230       m_values[0] = 0;
231       m_values[1] = 0;
232       m_values[2] = 0;
233    }
u32_to_u16_iterator(BaseIterator b)234    u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
235    {
236       m_values[0] = 0;
237       m_values[1] = 0;
238       m_values[2] = 0;
239    }
240 private:
241 
extract_current() const242    void extract_current()const
243    {
244       // begin by checking for a code point out of range:
245       std::uint32_t v = *m_position;
246       if(v >= 0x10000u)
247       {
248          if(v > 0x10FFFFu)
249             detail::invalid_utf32_code_point(*m_position);
250          // split into two surrogates:
251          m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
252          m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
253          m_current = 0;
254          BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0]));
255          BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1]));
256       }
257       else
258       {
259          // 16-bit code point:
260          m_values[0] = static_cast<U16Type>(*m_position);
261          m_values[1] = 0;
262          m_current = 0;
263          // value must not be a surrogate:
264          if(detail::is_surrogate(m_values[0]))
265             detail::invalid_utf32_code_point(*m_position);
266       }
267    }
268    BaseIterator m_position;
269    mutable U16Type m_values[3];
270    mutable unsigned m_current;
271 };
272 
273 template <class BaseIterator, class U32Type = std::uint32_t>
274 class u16_to_u32_iterator
275 {
276    // special values for pending iterator reads:
277    static const U32Type pending_read = 0xffffffffu;
278 
279    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
280 
281    static_assert(sizeof(base_value_type)*CHAR_BIT == 16, "Incorrectly sized template argument");
282    static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
283 
284 public:
285    typedef std::ptrdiff_t     difference_type;
286    typedef U32Type            value_type;
287    typedef value_type const*  pointer;
288    typedef value_type const   reference;
289    typedef std::bidirectional_iterator_tag iterator_category;
290 
operator *() const291    reference operator*()const
292    {
293       if(m_value == pending_read)
294          extract_current();
295       return m_value;
296    }
operator ==(const u16_to_u32_iterator & that) const297    bool operator==(const u16_to_u32_iterator& that)const
298    {
299       return m_position == that.m_position;
300    }
operator !=(const u16_to_u32_iterator & that) const301    bool operator!=(const u16_to_u32_iterator& that)const
302    {
303       return !(*this == that);
304    }
operator ++()305    u16_to_u32_iterator& operator++()
306    {
307       // skip high surrogate first if there is one:
308       if(detail::is_high_surrogate(*m_position)) ++m_position;
309       ++m_position;
310       m_value = pending_read;
311       return *this;
312    }
operator ++(int)313    u16_to_u32_iterator operator++(int)
314    {
315       u16_to_u32_iterator r(*this);
316       ++(*this);
317       return r;
318    }
operator --()319    u16_to_u32_iterator& operator--()
320    {
321       --m_position;
322       // if we have a low surrogate then go back one more:
323       if(detail::is_low_surrogate(*m_position))
324          --m_position;
325       m_value = pending_read;
326       return *this;
327    }
operator --(int)328    u16_to_u32_iterator operator--(int)
329    {
330       u16_to_u32_iterator r(*this);
331       --(*this);
332       return r;
333    }
base() const334    BaseIterator base()const
335    {
336       return m_position;
337    }
338    // construct:
u16_to_u32_iterator()339    u16_to_u32_iterator() : m_position()
340    {
341       m_value = pending_read;
342    }
u16_to_u32_iterator(BaseIterator b)343    u16_to_u32_iterator(BaseIterator b) : m_position(b)
344    {
345       m_value = pending_read;
346    }
347    //
348    // Range checked version:
349    //
u16_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)350    u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
351    {
352       m_value = pending_read;
353       //
354       // The range must not start with a low surrogate, or end in a high surrogate,
355       // otherwise we run the risk of running outside the underlying input range.
356       // Likewise b must not be located at a low surrogate.
357       //
358       std::uint16_t val;
359       if(start != end)
360       {
361          if((b != start) && (b != end))
362          {
363             val = *b;
364             if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
365                invalid_code_point(val);
366          }
367          val = *start;
368          if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
369             invalid_code_point(val);
370          val = *--end;
371          if(detail::is_high_surrogate(val))
372             invalid_code_point(val);
373       }
374    }
375 private:
invalid_code_point(std::uint16_t val)376    static void invalid_code_point(std::uint16_t val)
377    {
378       std::stringstream ss;
379       ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
380       std::out_of_range e(ss.str());
381 #ifndef BOOST_REGEX_STANDALONE
382       boost::throw_exception(e);
383 #else
384       throw e;
385 #endif
386    }
extract_current() const387    void extract_current()const
388    {
389       m_value = static_cast<U32Type>(static_cast< std::uint16_t>(*m_position));
390       // if the last value is a high surrogate then adjust m_position and m_value as needed:
391       if(detail::is_high_surrogate(*m_position))
392       {
393          // precondition; next value must have be a low-surrogate:
394          BaseIterator next(m_position);
395          std::uint16_t t = *++next;
396          if((t & 0xFC00u) != 0xDC00u)
397             invalid_code_point(t);
398          m_value = (m_value - detail::high_surrogate_base) << 10;
399          m_value |= (static_cast<U32Type>(static_cast< std::uint16_t>(t)) & detail::ten_bit_mask);
400       }
401       // postcondition; result must not be a surrogate:
402       if(detail::is_surrogate(m_value))
403          invalid_code_point(static_cast< std::uint16_t>(m_value));
404    }
405    BaseIterator m_position;
406    mutable U32Type m_value;
407 };
408 
409 template <class BaseIterator, class U8Type = std::uint8_t>
410 class u32_to_u8_iterator
411 {
412    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
413 
414    static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
415    static_assert(sizeof(U8Type)*CHAR_BIT == 8, "Incorrectly sized template argument");
416 
417 public:
418    typedef std::ptrdiff_t     difference_type;
419    typedef U8Type             value_type;
420    typedef value_type const*  pointer;
421    typedef value_type const   reference;
422    typedef std::bidirectional_iterator_tag iterator_category;
423 
operator *() const424    reference operator*()const
425    {
426       if(m_current == 4)
427          extract_current();
428       return m_values[m_current];
429    }
operator ==(const u32_to_u8_iterator & that) const430    bool operator==(const u32_to_u8_iterator& that)const
431    {
432       if(m_position == that.m_position)
433       {
434          // either the m_current's must be equal, or one must be 0 and
435          // the other 4: which means neither must have bits 1 or 2 set:
436          return (m_current == that.m_current)
437             || (((m_current | that.m_current) & 3) == 0);
438       }
439       return false;
440    }
operator !=(const u32_to_u8_iterator & that) const441    bool operator!=(const u32_to_u8_iterator& that)const
442    {
443       return !(*this == that);
444    }
operator ++()445    u32_to_u8_iterator& operator++()
446    {
447       // if we have a pending read then read now, so that we know whether
448       // to skip a position, or move to a low-surrogate:
449       if(m_current == 4)
450       {
451          // pending read:
452          extract_current();
453       }
454       // move to the next surrogate position:
455       ++m_current;
456       // if we've reached the end skip a position:
457       if(m_values[m_current] == 0)
458       {
459          m_current = 4;
460          ++m_position;
461       }
462       return *this;
463    }
operator ++(int)464    u32_to_u8_iterator operator++(int)
465    {
466       u32_to_u8_iterator r(*this);
467       ++(*this);
468       return r;
469    }
operator --()470    u32_to_u8_iterator& operator--()
471    {
472       if((m_current & 3) == 0)
473       {
474          --m_position;
475          extract_current();
476          m_current = 3;
477          while(m_current && (m_values[m_current] == 0))
478             --m_current;
479       }
480       else
481          --m_current;
482       return *this;
483    }
operator --(int)484    u32_to_u8_iterator operator--(int)
485    {
486       u32_to_u8_iterator r(*this);
487       --(*this);
488       return r;
489    }
base() const490    BaseIterator base()const
491    {
492       return m_position;
493    }
494    // construct:
u32_to_u8_iterator()495    u32_to_u8_iterator() : m_position(), m_current(0)
496    {
497       m_values[0] = 0;
498       m_values[1] = 0;
499       m_values[2] = 0;
500       m_values[3] = 0;
501       m_values[4] = 0;
502    }
u32_to_u8_iterator(BaseIterator b)503    u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
504    {
505       m_values[0] = 0;
506       m_values[1] = 0;
507       m_values[2] = 0;
508       m_values[3] = 0;
509       m_values[4] = 0;
510    }
511 private:
512 
extract_current() const513    void extract_current()const
514    {
515       std::uint32_t c = *m_position;
516       if(c > 0x10FFFFu)
517          detail::invalid_utf32_code_point(c);
518       if(c < 0x80u)
519       {
520          m_values[0] = static_cast<unsigned char>(c);
521          m_values[1] = static_cast<unsigned char>(0u);
522          m_values[2] = static_cast<unsigned char>(0u);
523          m_values[3] = static_cast<unsigned char>(0u);
524       }
525       else if(c < 0x800u)
526       {
527          m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
528          m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
529          m_values[2] = static_cast<unsigned char>(0u);
530          m_values[3] = static_cast<unsigned char>(0u);
531       }
532       else if(c < 0x10000u)
533       {
534          m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
535          m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
536          m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
537          m_values[3] = static_cast<unsigned char>(0u);
538       }
539       else
540       {
541          m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
542          m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
543          m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
544          m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
545       }
546       m_current= 0;
547    }
548    BaseIterator m_position;
549    mutable U8Type m_values[5];
550    mutable unsigned m_current;
551 };
552 
553 template <class BaseIterator, class U32Type = std::uint32_t>
554 class u8_to_u32_iterator
555 {
556    // special values for pending iterator reads:
557    static const U32Type pending_read = 0xffffffffu;
558 
559    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
560 
561    static_assert(sizeof(base_value_type)*CHAR_BIT == 8, "Incorrectly sized template argument");
562    static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
563 
564 public:
565    typedef std::ptrdiff_t     difference_type;
566    typedef U32Type            value_type;
567    typedef value_type const*  pointer;
568    typedef value_type const   reference;
569    typedef std::bidirectional_iterator_tag iterator_category;
570 
operator *() const571    reference operator*()const
572    {
573       if(m_value == pending_read)
574          extract_current();
575       return m_value;
576    }
operator ==(const u8_to_u32_iterator & that) const577    bool operator==(const u8_to_u32_iterator& that)const
578    {
579       return m_position == that.m_position;
580    }
operator !=(const u8_to_u32_iterator & that) const581    bool operator!=(const u8_to_u32_iterator& that)const
582    {
583       return !(*this == that);
584    }
operator ++()585    u8_to_u32_iterator& operator++()
586    {
587       // We must not start with a continuation character:
588       if((static_cast<std::uint8_t>(*m_position) & 0xC0) == 0x80)
589          invalid_sequence();
590       // skip high surrogate first if there is one:
591       unsigned c = detail::utf8_byte_count(*m_position);
592       if(m_value == pending_read)
593       {
594          // Since we haven't read in a value, we need to validate the code points:
595          for(unsigned i = 0; i < c; ++i)
596          {
597             ++m_position;
598             // We must have a continuation byte:
599             if((i != c - 1) && ((static_cast<std::uint8_t>(*m_position) & 0xC0) != 0x80))
600                invalid_sequence();
601          }
602       }
603       else
604       {
605          std::advance(m_position, c);
606       }
607       m_value = pending_read;
608       return *this;
609    }
operator ++(int)610    u8_to_u32_iterator operator++(int)
611    {
612       u8_to_u32_iterator r(*this);
613       ++(*this);
614       return r;
615    }
operator --()616    u8_to_u32_iterator& operator--()
617    {
618       // Keep backtracking until we don't have a trailing character:
619       unsigned count = 0;
620       while((*--m_position & 0xC0u) == 0x80u) ++count;
621       // now check that the sequence was valid:
622       if(count != detail::utf8_trailing_byte_count(*m_position))
623          invalid_sequence();
624       m_value = pending_read;
625       return *this;
626    }
operator --(int)627    u8_to_u32_iterator operator--(int)
628    {
629       u8_to_u32_iterator r(*this);
630       --(*this);
631       return r;
632    }
base() const633    BaseIterator base()const
634    {
635       return m_position;
636    }
637    // construct:
u8_to_u32_iterator()638    u8_to_u32_iterator() : m_position()
639    {
640       m_value = pending_read;
641    }
u8_to_u32_iterator(BaseIterator b)642    u8_to_u32_iterator(BaseIterator b) : m_position(b)
643    {
644       m_value = pending_read;
645    }
646    //
647    // Checked constructor:
648    //
u8_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)649    u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
650    {
651       m_value = pending_read;
652       //
653       // We must not start with a continuation character, or end with a
654       // truncated UTF-8 sequence otherwise we run the risk of going past
655       // the start/end of the underlying sequence:
656       //
657       if(start != end)
658       {
659          unsigned char v = *start;
660          if((v & 0xC0u) == 0x80u)
661             invalid_sequence();
662          if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
663             invalid_sequence();
664          BaseIterator pos = end;
665          do
666          {
667             v = *--pos;
668          }
669          while((start != pos) && ((v & 0xC0u) == 0x80u));
670          std::ptrdiff_t extra = detail::utf8_byte_count(v);
671          if(std::distance(pos, end) < extra)
672             invalid_sequence();
673       }
674    }
675 private:
invalid_sequence()676    static void invalid_sequence()
677    {
678       std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
679 #ifndef BOOST_REGEX_STANDALONE
680       boost::throw_exception(e);
681 #else
682       throw e;
683 #endif
684    }
extract_current() const685    void extract_current()const
686    {
687       m_value = static_cast<U32Type>(static_cast< std::uint8_t>(*m_position));
688       // we must not have a continuation character:
689       if((m_value & 0xC0u) == 0x80u)
690          invalid_sequence();
691       // see how many extra bytes we have:
692       unsigned extra = detail::utf8_trailing_byte_count(*m_position);
693       // extract the extra bits, 6 from each extra byte:
694       BaseIterator next(m_position);
695       for(unsigned c = 0; c < extra; ++c)
696       {
697          ++next;
698          m_value <<= 6;
699          // We must have a continuation byte:
700          if((static_cast<std::uint8_t>(*next) & 0xC0) != 0x80)
701             invalid_sequence();
702          m_value += static_cast<std::uint8_t>(*next) & 0x3Fu;
703       }
704       // we now need to remove a few of the leftmost bits, but how many depends
705       // upon how many extra bytes we've extracted:
706       static const std::uint32_t masks[4] =
707       {
708          0x7Fu,
709          0x7FFu,
710          0xFFFFu,
711          0x1FFFFFu,
712       };
713       m_value &= masks[extra];
714       // check the result is in range:
715       if(m_value > static_cast<U32Type>(0x10FFFFu))
716          invalid_sequence();
717       // The result must not be a surrogate:
718       if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
719          invalid_sequence();
720       // We should not have had an invalidly encoded UTF8 sequence:
721       if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
722          invalid_sequence();
723    }
724    BaseIterator m_position;
725    mutable U32Type m_value;
726 };
727 
728 template <class BaseIterator>
729 class utf16_output_iterator
730 {
731 public:
732    typedef void                                   difference_type;
733    typedef void                                   value_type;
734    typedef std::uint32_t*                         pointer;
735    typedef std::uint32_t&                         reference;
736    typedef std::output_iterator_tag               iterator_category;
737 
utf16_output_iterator(const BaseIterator & b)738    utf16_output_iterator(const BaseIterator& b)
739       : m_position(b){}
utf16_output_iterator(const utf16_output_iterator & that)740    utf16_output_iterator(const utf16_output_iterator& that)
741       : m_position(that.m_position){}
operator =(const utf16_output_iterator & that)742    utf16_output_iterator& operator=(const utf16_output_iterator& that)
743    {
744       m_position = that.m_position;
745       return *this;
746    }
operator *() const747    const utf16_output_iterator& operator*()const
748    {
749       return *this;
750    }
operator =(std::uint32_t val) const751    void operator=(std::uint32_t val)const
752    {
753       push(val);
754    }
operator ++()755    utf16_output_iterator& operator++()
756    {
757       return *this;
758    }
operator ++(int)759    utf16_output_iterator& operator++(int)
760    {
761       return *this;
762    }
base() const763    BaseIterator base()const
764    {
765       return m_position;
766    }
767 private:
push(std::uint32_t v) const768    void push(std::uint32_t v)const
769    {
770       if(v >= 0x10000u)
771       {
772          // begin by checking for a code point out of range:
773          if(v > 0x10FFFFu)
774             detail::invalid_utf32_code_point(v);
775          // split into two surrogates:
776          *m_position++ = static_cast<std::uint16_t>(v >> 10) + detail::high_surrogate_base;
777          *m_position++ = static_cast<std::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
778       }
779       else
780       {
781          // 16-bit code point:
782          // value must not be a surrogate:
783          if(detail::is_surrogate(v))
784             detail::invalid_utf32_code_point(v);
785          *m_position++ = static_cast<std::uint16_t>(v);
786       }
787    }
788    mutable BaseIterator m_position;
789 };
790 
791 template <class BaseIterator>
792 class utf8_output_iterator
793 {
794 public:
795    typedef void                                   difference_type;
796    typedef void                                   value_type;
797    typedef std::uint32_t*                       pointer;
798    typedef std::uint32_t&                       reference;
799    typedef std::output_iterator_tag               iterator_category;
800 
utf8_output_iterator(const BaseIterator & b)801    utf8_output_iterator(const BaseIterator& b)
802       : m_position(b){}
utf8_output_iterator(const utf8_output_iterator & that)803    utf8_output_iterator(const utf8_output_iterator& that)
804       : m_position(that.m_position){}
operator =(const utf8_output_iterator & that)805    utf8_output_iterator& operator=(const utf8_output_iterator& that)
806    {
807       m_position = that.m_position;
808       return *this;
809    }
operator *() const810    const utf8_output_iterator& operator*()const
811    {
812       return *this;
813    }
operator =(std::uint32_t val) const814    void operator=(std::uint32_t val)const
815    {
816       push(val);
817    }
operator ++()818    utf8_output_iterator& operator++()
819    {
820       return *this;
821    }
operator ++(int)822    utf8_output_iterator& operator++(int)
823    {
824       return *this;
825    }
base() const826    BaseIterator base()const
827    {
828       return m_position;
829    }
830 private:
push(std::uint32_t c) const831    void push(std::uint32_t c)const
832    {
833       if(c > 0x10FFFFu)
834          detail::invalid_utf32_code_point(c);
835       if(c < 0x80u)
836       {
837          *m_position++ = static_cast<unsigned char>(c);
838       }
839       else if(c < 0x800u)
840       {
841          *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
842          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
843       }
844       else if(c < 0x10000u)
845       {
846          *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
847          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
848          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
849       }
850       else
851       {
852          *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
853          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
854          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
855          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
856       }
857    }
858    mutable BaseIterator m_position;
859 };
860 
861 } // namespace boost
862 
863 #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP
864 
865