1 /*
2 *
3 * Copyright (c) 2004
4 * John Maddock
5 *
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 *
10 */
11
12 /*
13 * LOCATION: see http://www.boost.org for most recent version.
14 * FILE unicode_iterator.hpp
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
17 */
18
19 /****************************************************************************
20
21 Contents:
22 ~~~~~~~~~
23
24 1) Read Only, Input Adapters:
25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26
27 template <class BaseIterator, class U8Type = std::uint8_t>
28 class u32_to_u8_iterator;
29
30 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
31
32 template <class BaseIterator, class U32Type = std::uint32_t>
33 class u8_to_u32_iterator;
34
Adapts a sequence of UTF-8 code units to "look like" a sequence of UTF-32 code points.
36
37 template <class BaseIterator, class U16Type = std::uint16_t>
38 class u32_to_u16_iterator;
39
40 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
41
42 template <class BaseIterator, class U32Type = std::uint32_t>
43 class u16_to_u32_iterator;
44
Adapts a sequence of UTF-16 code units to "look like" a sequence of UTF-32 code points.
46
47 2) Single pass output iterator adapters:
48
49 template <class BaseIterator>
50 class utf8_output_iterator;
51
Accepts UTF-32 code points and forwards them on as UTF-8 code units.
53
54 template <class BaseIterator>
55 class utf16_output_iterator;
56
Accepts UTF-32 code points and forwards them on as UTF-16 code units.
58
59 ****************************************************************************/
60
61 #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
62 #define BOOST_REGEX_UNICODE_ITERATOR_HPP
63 #include <cstdint>
64 #include <boost/regex/config.hpp>
65 #include <stdexcept>
66 #include <sstream>
67 #include <ios>
68 #include <limits.h> // CHAR_BIT
69
70 #include <iostream>
71
72 #ifndef BOOST_REGEX_STANDALONE
73 #include <boost/throw_exception.hpp>
74 #endif
75
76 namespace boost{
77
78 namespace detail{
79
//
// Constants used when converting between UTF-32 code points and UTF-16
// surrogate pairs.
//
// Added to (code_point >> 10) to obtain the high surrogate. This is 0xD800
// less the 0x40 that the 0x10000 offset of the supplementary planes
// contributes after the shift, i.e. 0xD7C0:
static const std::uint16_t high_surrogate_base = 0xD800u - (0x10000u >> 10);
// Added to the low ten bits of the code point to obtain the low surrogate:
static const std::uint16_t low_surrogate_base = 0xDC00u;
// Selects the ten payload bits carried by a surrogate (0x3FF):
static const std::uint32_t ten_bit_mask = (1u << 10) - 1u;
83
// Returns true if v is a UTF-16 high (leading) surrogate, i.e. lies in the
// range 0xD800..0xDBFF.
inline bool is_high_surrogate(std::uint16_t v)
{
   // Discarding the ten payload bits leaves the six bit surrogate tag,
   // which is 110110 (0x36) for a high surrogate:
   const unsigned tag = static_cast<unsigned>(v) >> 10;
   return tag == 0x36u;
}
// Returns true if v is a UTF-16 low (trailing) surrogate, i.e. lies in the
// range 0xDC00..0xDFFF.
inline bool is_low_surrogate(std::uint16_t v)
{
   // Discarding the ten payload bits leaves the six bit surrogate tag,
   // which is 110111 (0x37) for a low surrogate:
   const unsigned tag = static_cast<unsigned>(v) >> 10;
   return tag == 0x37u;
}
// Returns true if v is a UTF-16 surrogate of either kind, i.e. its value
// lies in the range 0xD800..0xDFFF.
template <class T>
inline bool is_surrogate(T v)
{
   // Clear the eleven low bits; what remains identifies the surrogate block:
   const auto block = v & 0xFFFFF800u;
   return block == 0xd800;
}
97
// Returns the length in bytes (1 to 4) of the UTF-8 sequence introduced by
// the byte c. The length is the number of leading one bits in c; a byte with
// no leading one bits is a single byte sequence, and counts above four are
// clamped to four.
inline unsigned utf8_byte_count(std::uint8_t c)
{
   // Walk down from the most significant bit, counting set bits until the
   // first clear one (or until all eight bits have been examined):
   unsigned leading_ones = 0;
   for(unsigned bit = 0x80u; (bit != 0u) && ((c & bit) != 0u); bit >>= 1)
      ++leading_ones;

   if(leading_ones == 0u)
      return 1u;
   if(leading_ones > 4u)
      return 4u;
   return leading_ones;
}
111
utf8_trailing_byte_count(std::uint8_t c)112 inline unsigned utf8_trailing_byte_count(std::uint8_t c)
113 {
114 return utf8_byte_count(c) - 1;
115 }
116
117 #ifdef BOOST_REGEX_MSVC
118 #pragma warning(push)
119 #pragma warning(disable:4100)
120 #endif
121 #ifndef BOOST_NO_EXCEPTIONS
122 BOOST_REGEX_NORETURN
123 #endif
invalid_utf32_code_point(std::uint32_t val)124 inline void invalid_utf32_code_point(std::uint32_t val)
125 {
126 std::stringstream ss;
127 ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
128 std::out_of_range e(ss.str());
129 #ifndef BOOST_REGEX_STANDALONE
130 boost::throw_exception(e);
131 #else
132 throw e;
133 #endif
134 }
135 #ifdef BOOST_REGEX_MSVC
136 #pragma warning(pop)
137 #endif
138
139
140 } // namespace detail
141
142 template <class BaseIterator, class U16Type = std::uint16_t>
143 class u32_to_u16_iterator
144 {
145 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
146
147 static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
148 static_assert(sizeof(U16Type)*CHAR_BIT == 16, "Incorrectly sized template argument");
149
150 public:
151 typedef std::ptrdiff_t difference_type;
152 typedef U16Type value_type;
153 typedef value_type const* pointer;
154 typedef value_type const reference;
155 typedef std::bidirectional_iterator_tag iterator_category;
156
operator *() const157 reference operator*()const
158 {
159 if(m_current == 2)
160 extract_current();
161 return m_values[m_current];
162 }
operator ==(const u32_to_u16_iterator & that) const163 bool operator==(const u32_to_u16_iterator& that)const
164 {
165 if(m_position == that.m_position)
166 {
167 // Both m_currents must be equal, or both even
168 // this is the same as saying their sum must be even:
169 return (m_current + that.m_current) & 1u ? false : true;
170 }
171 return false;
172 }
operator !=(const u32_to_u16_iterator & that) const173 bool operator!=(const u32_to_u16_iterator& that)const
174 {
175 return !(*this == that);
176 }
operator ++()177 u32_to_u16_iterator& operator++()
178 {
179 // if we have a pending read then read now, so that we know whether
180 // to skip a position, or move to a low-surrogate:
181 if(m_current == 2)
182 {
183 // pending read:
184 extract_current();
185 }
186 // move to the next surrogate position:
187 ++m_current;
188 // if we've reached the end skip a position:
189 if(m_values[m_current] == 0)
190 {
191 m_current = 2;
192 ++m_position;
193 }
194 return *this;
195 }
operator ++(int)196 u32_to_u16_iterator operator++(int)
197 {
198 u32_to_u16_iterator r(*this);
199 ++(*this);
200 return r;
201 }
operator --()202 u32_to_u16_iterator& operator--()
203 {
204 if(m_current != 1)
205 {
206 // decrementing an iterator always leads to a valid position:
207 --m_position;
208 extract_current();
209 m_current = m_values[1] ? 1 : 0;
210 }
211 else
212 {
213 m_current = 0;
214 }
215 return *this;
216 }
operator --(int)217 u32_to_u16_iterator operator--(int)
218 {
219 u32_to_u16_iterator r(*this);
220 --(*this);
221 return r;
222 }
base() const223 BaseIterator base()const
224 {
225 return m_position;
226 }
227 // construct:
u32_to_u16_iterator()228 u32_to_u16_iterator() : m_position(), m_current(0)
229 {
230 m_values[0] = 0;
231 m_values[1] = 0;
232 m_values[2] = 0;
233 }
u32_to_u16_iterator(BaseIterator b)234 u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
235 {
236 m_values[0] = 0;
237 m_values[1] = 0;
238 m_values[2] = 0;
239 }
240 private:
241
extract_current() const242 void extract_current()const
243 {
244 // begin by checking for a code point out of range:
245 std::uint32_t v = *m_position;
246 if(v >= 0x10000u)
247 {
248 if(v > 0x10FFFFu)
249 detail::invalid_utf32_code_point(*m_position);
250 // split into two surrogates:
251 m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
252 m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
253 m_current = 0;
254 BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0]));
255 BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1]));
256 }
257 else
258 {
259 // 16-bit code point:
260 m_values[0] = static_cast<U16Type>(*m_position);
261 m_values[1] = 0;
262 m_current = 0;
263 // value must not be a surrogate:
264 if(detail::is_surrogate(m_values[0]))
265 detail::invalid_utf32_code_point(*m_position);
266 }
267 }
268 BaseIterator m_position;
269 mutable U16Type m_values[3];
270 mutable unsigned m_current;
271 };
272
273 template <class BaseIterator, class U32Type = std::uint32_t>
274 class u16_to_u32_iterator
275 {
276 // special values for pending iterator reads:
277 static const U32Type pending_read = 0xffffffffu;
278
279 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
280
281 static_assert(sizeof(base_value_type)*CHAR_BIT == 16, "Incorrectly sized template argument");
282 static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
283
284 public:
285 typedef std::ptrdiff_t difference_type;
286 typedef U32Type value_type;
287 typedef value_type const* pointer;
288 typedef value_type const reference;
289 typedef std::bidirectional_iterator_tag iterator_category;
290
operator *() const291 reference operator*()const
292 {
293 if(m_value == pending_read)
294 extract_current();
295 return m_value;
296 }
operator ==(const u16_to_u32_iterator & that) const297 bool operator==(const u16_to_u32_iterator& that)const
298 {
299 return m_position == that.m_position;
300 }
operator !=(const u16_to_u32_iterator & that) const301 bool operator!=(const u16_to_u32_iterator& that)const
302 {
303 return !(*this == that);
304 }
operator ++()305 u16_to_u32_iterator& operator++()
306 {
307 // skip high surrogate first if there is one:
308 if(detail::is_high_surrogate(*m_position)) ++m_position;
309 ++m_position;
310 m_value = pending_read;
311 return *this;
312 }
operator ++(int)313 u16_to_u32_iterator operator++(int)
314 {
315 u16_to_u32_iterator r(*this);
316 ++(*this);
317 return r;
318 }
operator --()319 u16_to_u32_iterator& operator--()
320 {
321 --m_position;
322 // if we have a low surrogate then go back one more:
323 if(detail::is_low_surrogate(*m_position))
324 --m_position;
325 m_value = pending_read;
326 return *this;
327 }
operator --(int)328 u16_to_u32_iterator operator--(int)
329 {
330 u16_to_u32_iterator r(*this);
331 --(*this);
332 return r;
333 }
base() const334 BaseIterator base()const
335 {
336 return m_position;
337 }
338 // construct:
u16_to_u32_iterator()339 u16_to_u32_iterator() : m_position()
340 {
341 m_value = pending_read;
342 }
u16_to_u32_iterator(BaseIterator b)343 u16_to_u32_iterator(BaseIterator b) : m_position(b)
344 {
345 m_value = pending_read;
346 }
347 //
348 // Range checked version:
349 //
u16_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)350 u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
351 {
352 m_value = pending_read;
353 //
354 // The range must not start with a low surrogate, or end in a high surrogate,
355 // otherwise we run the risk of running outside the underlying input range.
356 // Likewise b must not be located at a low surrogate.
357 //
358 std::uint16_t val;
359 if(start != end)
360 {
361 if((b != start) && (b != end))
362 {
363 val = *b;
364 if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
365 invalid_code_point(val);
366 }
367 val = *start;
368 if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
369 invalid_code_point(val);
370 val = *--end;
371 if(detail::is_high_surrogate(val))
372 invalid_code_point(val);
373 }
374 }
375 private:
invalid_code_point(std::uint16_t val)376 static void invalid_code_point(std::uint16_t val)
377 {
378 std::stringstream ss;
379 ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
380 std::out_of_range e(ss.str());
381 #ifndef BOOST_REGEX_STANDALONE
382 boost::throw_exception(e);
383 #else
384 throw e;
385 #endif
386 }
extract_current() const387 void extract_current()const
388 {
389 m_value = static_cast<U32Type>(static_cast< std::uint16_t>(*m_position));
390 // if the last value is a high surrogate then adjust m_position and m_value as needed:
391 if(detail::is_high_surrogate(*m_position))
392 {
393 // precondition; next value must have be a low-surrogate:
394 BaseIterator next(m_position);
395 std::uint16_t t = *++next;
396 if((t & 0xFC00u) != 0xDC00u)
397 invalid_code_point(t);
398 m_value = (m_value - detail::high_surrogate_base) << 10;
399 m_value |= (static_cast<U32Type>(static_cast< std::uint16_t>(t)) & detail::ten_bit_mask);
400 }
401 // postcondition; result must not be a surrogate:
402 if(detail::is_surrogate(m_value))
403 invalid_code_point(static_cast< std::uint16_t>(m_value));
404 }
405 BaseIterator m_position;
406 mutable U32Type m_value;
407 };
408
409 template <class BaseIterator, class U8Type = std::uint8_t>
410 class u32_to_u8_iterator
411 {
412 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
413
414 static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
415 static_assert(sizeof(U8Type)*CHAR_BIT == 8, "Incorrectly sized template argument");
416
417 public:
418 typedef std::ptrdiff_t difference_type;
419 typedef U8Type value_type;
420 typedef value_type const* pointer;
421 typedef value_type const reference;
422 typedef std::bidirectional_iterator_tag iterator_category;
423
operator *() const424 reference operator*()const
425 {
426 if(m_current == 4)
427 extract_current();
428 return m_values[m_current];
429 }
operator ==(const u32_to_u8_iterator & that) const430 bool operator==(const u32_to_u8_iterator& that)const
431 {
432 if(m_position == that.m_position)
433 {
434 // either the m_current's must be equal, or one must be 0 and
435 // the other 4: which means neither must have bits 1 or 2 set:
436 return (m_current == that.m_current)
437 || (((m_current | that.m_current) & 3) == 0);
438 }
439 return false;
440 }
operator !=(const u32_to_u8_iterator & that) const441 bool operator!=(const u32_to_u8_iterator& that)const
442 {
443 return !(*this == that);
444 }
operator ++()445 u32_to_u8_iterator& operator++()
446 {
447 // if we have a pending read then read now, so that we know whether
448 // to skip a position, or move to a low-surrogate:
449 if(m_current == 4)
450 {
451 // pending read:
452 extract_current();
453 }
454 // move to the next surrogate position:
455 ++m_current;
456 // if we've reached the end skip a position:
457 if(m_values[m_current] == 0)
458 {
459 m_current = 4;
460 ++m_position;
461 }
462 return *this;
463 }
operator ++(int)464 u32_to_u8_iterator operator++(int)
465 {
466 u32_to_u8_iterator r(*this);
467 ++(*this);
468 return r;
469 }
operator --()470 u32_to_u8_iterator& operator--()
471 {
472 if((m_current & 3) == 0)
473 {
474 --m_position;
475 extract_current();
476 m_current = 3;
477 while(m_current && (m_values[m_current] == 0))
478 --m_current;
479 }
480 else
481 --m_current;
482 return *this;
483 }
operator --(int)484 u32_to_u8_iterator operator--(int)
485 {
486 u32_to_u8_iterator r(*this);
487 --(*this);
488 return r;
489 }
base() const490 BaseIterator base()const
491 {
492 return m_position;
493 }
494 // construct:
u32_to_u8_iterator()495 u32_to_u8_iterator() : m_position(), m_current(0)
496 {
497 m_values[0] = 0;
498 m_values[1] = 0;
499 m_values[2] = 0;
500 m_values[3] = 0;
501 m_values[4] = 0;
502 }
u32_to_u8_iterator(BaseIterator b)503 u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
504 {
505 m_values[0] = 0;
506 m_values[1] = 0;
507 m_values[2] = 0;
508 m_values[3] = 0;
509 m_values[4] = 0;
510 }
511 private:
512
extract_current() const513 void extract_current()const
514 {
515 std::uint32_t c = *m_position;
516 if(c > 0x10FFFFu)
517 detail::invalid_utf32_code_point(c);
518 if(c < 0x80u)
519 {
520 m_values[0] = static_cast<unsigned char>(c);
521 m_values[1] = static_cast<unsigned char>(0u);
522 m_values[2] = static_cast<unsigned char>(0u);
523 m_values[3] = static_cast<unsigned char>(0u);
524 }
525 else if(c < 0x800u)
526 {
527 m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
528 m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
529 m_values[2] = static_cast<unsigned char>(0u);
530 m_values[3] = static_cast<unsigned char>(0u);
531 }
532 else if(c < 0x10000u)
533 {
534 m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
535 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
536 m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
537 m_values[3] = static_cast<unsigned char>(0u);
538 }
539 else
540 {
541 m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
542 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
543 m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
544 m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
545 }
546 m_current= 0;
547 }
548 BaseIterator m_position;
549 mutable U8Type m_values[5];
550 mutable unsigned m_current;
551 };
552
553 template <class BaseIterator, class U32Type = std::uint32_t>
554 class u8_to_u32_iterator
555 {
556 // special values for pending iterator reads:
557 static const U32Type pending_read = 0xffffffffu;
558
559 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
560
561 static_assert(sizeof(base_value_type)*CHAR_BIT == 8, "Incorrectly sized template argument");
562 static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
563
564 public:
565 typedef std::ptrdiff_t difference_type;
566 typedef U32Type value_type;
567 typedef value_type const* pointer;
568 typedef value_type const reference;
569 typedef std::bidirectional_iterator_tag iterator_category;
570
operator *() const571 reference operator*()const
572 {
573 if(m_value == pending_read)
574 extract_current();
575 return m_value;
576 }
operator ==(const u8_to_u32_iterator & that) const577 bool operator==(const u8_to_u32_iterator& that)const
578 {
579 return m_position == that.m_position;
580 }
operator !=(const u8_to_u32_iterator & that) const581 bool operator!=(const u8_to_u32_iterator& that)const
582 {
583 return !(*this == that);
584 }
operator ++()585 u8_to_u32_iterator& operator++()
586 {
587 // We must not start with a continuation character:
588 if((static_cast<std::uint8_t>(*m_position) & 0xC0) == 0x80)
589 invalid_sequence();
590 // skip high surrogate first if there is one:
591 unsigned c = detail::utf8_byte_count(*m_position);
592 if(m_value == pending_read)
593 {
594 // Since we haven't read in a value, we need to validate the code points:
595 for(unsigned i = 0; i < c; ++i)
596 {
597 ++m_position;
598 // We must have a continuation byte:
599 if((i != c - 1) && ((static_cast<std::uint8_t>(*m_position) & 0xC0) != 0x80))
600 invalid_sequence();
601 }
602 }
603 else
604 {
605 std::advance(m_position, c);
606 }
607 m_value = pending_read;
608 return *this;
609 }
operator ++(int)610 u8_to_u32_iterator operator++(int)
611 {
612 u8_to_u32_iterator r(*this);
613 ++(*this);
614 return r;
615 }
operator --()616 u8_to_u32_iterator& operator--()
617 {
618 // Keep backtracking until we don't have a trailing character:
619 unsigned count = 0;
620 while((*--m_position & 0xC0u) == 0x80u) ++count;
621 // now check that the sequence was valid:
622 if(count != detail::utf8_trailing_byte_count(*m_position))
623 invalid_sequence();
624 m_value = pending_read;
625 return *this;
626 }
operator --(int)627 u8_to_u32_iterator operator--(int)
628 {
629 u8_to_u32_iterator r(*this);
630 --(*this);
631 return r;
632 }
base() const633 BaseIterator base()const
634 {
635 return m_position;
636 }
637 // construct:
u8_to_u32_iterator()638 u8_to_u32_iterator() : m_position()
639 {
640 m_value = pending_read;
641 }
u8_to_u32_iterator(BaseIterator b)642 u8_to_u32_iterator(BaseIterator b) : m_position(b)
643 {
644 m_value = pending_read;
645 }
646 //
647 // Checked constructor:
648 //
u8_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)649 u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
650 {
651 m_value = pending_read;
652 //
653 // We must not start with a continuation character, or end with a
654 // truncated UTF-8 sequence otherwise we run the risk of going past
655 // the start/end of the underlying sequence:
656 //
657 if(start != end)
658 {
659 unsigned char v = *start;
660 if((v & 0xC0u) == 0x80u)
661 invalid_sequence();
662 if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
663 invalid_sequence();
664 BaseIterator pos = end;
665 do
666 {
667 v = *--pos;
668 }
669 while((start != pos) && ((v & 0xC0u) == 0x80u));
670 std::ptrdiff_t extra = detail::utf8_byte_count(v);
671 if(std::distance(pos, end) < extra)
672 invalid_sequence();
673 }
674 }
675 private:
invalid_sequence()676 static void invalid_sequence()
677 {
678 std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
679 #ifndef BOOST_REGEX_STANDALONE
680 boost::throw_exception(e);
681 #else
682 throw e;
683 #endif
684 }
extract_current() const685 void extract_current()const
686 {
687 m_value = static_cast<U32Type>(static_cast< std::uint8_t>(*m_position));
688 // we must not have a continuation character:
689 if((m_value & 0xC0u) == 0x80u)
690 invalid_sequence();
691 // see how many extra bytes we have:
692 unsigned extra = detail::utf8_trailing_byte_count(*m_position);
693 // extract the extra bits, 6 from each extra byte:
694 BaseIterator next(m_position);
695 for(unsigned c = 0; c < extra; ++c)
696 {
697 ++next;
698 m_value <<= 6;
699 // We must have a continuation byte:
700 if((static_cast<std::uint8_t>(*next) & 0xC0) != 0x80)
701 invalid_sequence();
702 m_value += static_cast<std::uint8_t>(*next) & 0x3Fu;
703 }
704 // we now need to remove a few of the leftmost bits, but how many depends
705 // upon how many extra bytes we've extracted:
706 static const std::uint32_t masks[4] =
707 {
708 0x7Fu,
709 0x7FFu,
710 0xFFFFu,
711 0x1FFFFFu,
712 };
713 m_value &= masks[extra];
714 // check the result is in range:
715 if(m_value > static_cast<U32Type>(0x10FFFFu))
716 invalid_sequence();
717 // The result must not be a surrogate:
718 if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
719 invalid_sequence();
720 // We should not have had an invalidly encoded UTF8 sequence:
721 if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
722 invalid_sequence();
723 }
724 BaseIterator m_position;
725 mutable U32Type m_value;
726 };
727
728 template <class BaseIterator>
729 class utf16_output_iterator
730 {
731 public:
732 typedef void difference_type;
733 typedef void value_type;
734 typedef std::uint32_t* pointer;
735 typedef std::uint32_t& reference;
736 typedef std::output_iterator_tag iterator_category;
737
utf16_output_iterator(const BaseIterator & b)738 utf16_output_iterator(const BaseIterator& b)
739 : m_position(b){}
utf16_output_iterator(const utf16_output_iterator & that)740 utf16_output_iterator(const utf16_output_iterator& that)
741 : m_position(that.m_position){}
operator =(const utf16_output_iterator & that)742 utf16_output_iterator& operator=(const utf16_output_iterator& that)
743 {
744 m_position = that.m_position;
745 return *this;
746 }
operator *() const747 const utf16_output_iterator& operator*()const
748 {
749 return *this;
750 }
operator =(std::uint32_t val) const751 void operator=(std::uint32_t val)const
752 {
753 push(val);
754 }
operator ++()755 utf16_output_iterator& operator++()
756 {
757 return *this;
758 }
operator ++(int)759 utf16_output_iterator& operator++(int)
760 {
761 return *this;
762 }
base() const763 BaseIterator base()const
764 {
765 return m_position;
766 }
767 private:
push(std::uint32_t v) const768 void push(std::uint32_t v)const
769 {
770 if(v >= 0x10000u)
771 {
772 // begin by checking for a code point out of range:
773 if(v > 0x10FFFFu)
774 detail::invalid_utf32_code_point(v);
775 // split into two surrogates:
776 *m_position++ = static_cast<std::uint16_t>(v >> 10) + detail::high_surrogate_base;
777 *m_position++ = static_cast<std::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
778 }
779 else
780 {
781 // 16-bit code point:
782 // value must not be a surrogate:
783 if(detail::is_surrogate(v))
784 detail::invalid_utf32_code_point(v);
785 *m_position++ = static_cast<std::uint16_t>(v);
786 }
787 }
788 mutable BaseIterator m_position;
789 };
790
791 template <class BaseIterator>
792 class utf8_output_iterator
793 {
794 public:
795 typedef void difference_type;
796 typedef void value_type;
797 typedef std::uint32_t* pointer;
798 typedef std::uint32_t& reference;
799 typedef std::output_iterator_tag iterator_category;
800
utf8_output_iterator(const BaseIterator & b)801 utf8_output_iterator(const BaseIterator& b)
802 : m_position(b){}
utf8_output_iterator(const utf8_output_iterator & that)803 utf8_output_iterator(const utf8_output_iterator& that)
804 : m_position(that.m_position){}
operator =(const utf8_output_iterator & that)805 utf8_output_iterator& operator=(const utf8_output_iterator& that)
806 {
807 m_position = that.m_position;
808 return *this;
809 }
operator *() const810 const utf8_output_iterator& operator*()const
811 {
812 return *this;
813 }
operator =(std::uint32_t val) const814 void operator=(std::uint32_t val)const
815 {
816 push(val);
817 }
operator ++()818 utf8_output_iterator& operator++()
819 {
820 return *this;
821 }
operator ++(int)822 utf8_output_iterator& operator++(int)
823 {
824 return *this;
825 }
base() const826 BaseIterator base()const
827 {
828 return m_position;
829 }
830 private:
push(std::uint32_t c) const831 void push(std::uint32_t c)const
832 {
833 if(c > 0x10FFFFu)
834 detail::invalid_utf32_code_point(c);
835 if(c < 0x80u)
836 {
837 *m_position++ = static_cast<unsigned char>(c);
838 }
839 else if(c < 0x800u)
840 {
841 *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
842 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
843 }
844 else if(c < 0x10000u)
845 {
846 *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
847 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
848 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
849 }
850 else
851 {
852 *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
853 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
854 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
855 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
856 }
857 }
858 mutable BaseIterator m_position;
859 };
860
861 } // namespace boost
862
863 #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP
864
865