• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 //  Distributed under the Boost Software License, Version 1.0. (See
5 //  accompanying file LICENSE_1_0.txt or copy at
6 //  http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #define BOOST_LOCALE_SOURCE
9 #include <boost/locale/boundary.hpp>
10 #include <boost/locale/generator.hpp>
11 #include <boost/locale/hold_ptr.hpp>
12 #include <unicode/uversion.h>
13 #if U_ICU_VERSION_MAJOR_NUM*100 + U_ICU_VERSION_MINOR_NUM >= 306
14 #include <unicode/utext.h>
15 #endif
16 #include <unicode/brkiter.h>
17 #include <unicode/rbbi.h>
18 
19 #include "cdata.hpp"
20 #include "all_generator.hpp"
21 #include "icu_util.hpp"
22 #include "uconv.hpp"
23 
24 namespace boost {
25 namespace locale {
26 namespace boundary {
27 namespace impl_icu {
28 
29 using namespace boost::locale::impl_icu;
30 
map_direct(boundary_type t,icu::BreakIterator * it,int reserve)31 index_type map_direct(boundary_type t,icu::BreakIterator *it,int reserve)
32 {
33     index_type indx;
34     indx.reserve(reserve);
35 #if U_ICU_VERSION_MAJOR_NUM >= 52
36     icu::BreakIterator *rbbi=it;
37 #else
38     icu::RuleBasedBreakIterator *rbbi=dynamic_cast<icu::RuleBasedBreakIterator *>(it);
39 #endif
40 
41     indx.push_back(break_info());
42     it->first();
43     int pos=0;
44     while((pos=it->next())!=icu::BreakIterator::DONE) {
45         indx.push_back(break_info(pos));
46         /// Character does not have any specific break types
47         if(t!=character && rbbi) {
48             //
49             // There is a collapse for MSVC: int32_t defined by both boost::cstdint and icu...
50             // So need to pick one ;(
51             //
52             std::vector< ::int32_t> buffer;
53             ::int32_t membuf[8]={0}; // try not to use memory allocation if possible
54             ::int32_t *buf=membuf;
55 
56             UErrorCode err=U_ZERO_ERROR;
57             int n = rbbi->getRuleStatusVec(buf,8,err);
58 
59             if(err == U_BUFFER_OVERFLOW_ERROR) {
60                 buf=&buffer.front();
61                 buffer.resize(n,0);
62                 n = rbbi->getRuleStatusVec(buf,buffer.size(),err);
63             }
64 
65             check_and_throw_icu_error(err);
66 
67             for(int i=0;i<n;i++) {
68                 switch(t) {
69                 case word:
70                     if(UBRK_WORD_NONE<=buf[i] && buf[i]<UBRK_WORD_NONE_LIMIT)
71                         indx.back().rule |= word_none;
72                     else if(UBRK_WORD_NUMBER<=buf[i] && buf[i]<UBRK_WORD_NUMBER_LIMIT)
73                         indx.back().rule |= word_number;
74                     else if(UBRK_WORD_LETTER<=buf[i] && buf[i]<UBRK_WORD_LETTER_LIMIT)
75                         indx.back().rule |= word_letter;
76                     else if(UBRK_WORD_KANA<=buf[i] && buf[i]<UBRK_WORD_KANA_LIMIT)
77                         indx.back().rule |= word_kana;
78                     else if(UBRK_WORD_IDEO<=buf[i] && buf[i]<UBRK_WORD_IDEO_LIMIT)
79                         indx.back().rule |= word_ideo;
80                     break;
81 
82                 case line:
83                     if(UBRK_LINE_SOFT<=buf[i] && buf[i]<UBRK_LINE_SOFT_LIMIT)
84                         indx.back().rule |= line_soft;
85                     else if(UBRK_LINE_HARD<=buf[i] && buf[i]<UBRK_LINE_HARD_LIMIT)
86                         indx.back().rule |= line_hard;
87                     break;
88 
89                 case sentence:
90                     if(UBRK_SENTENCE_TERM<=buf[i] && buf[i]<UBRK_SENTENCE_TERM_LIMIT)
91                         indx.back().rule |= sentence_term;
92                     else if(UBRK_SENTENCE_SEP<=buf[i] && buf[i]<UBRK_SENTENCE_SEP_LIMIT)
93                         indx.back().rule |= sentence_sep;
94                     break;
95                 default:
96                     ;
97                 }
98             }
99         }
100         else {
101             indx.back().rule |=character_any; // Baisc mark... for character
102         }
103     }
104     return indx;
105 }
106 
get_iterator(boundary_type t,icu::Locale const & loc)107 icu::BreakIterator *get_iterator(boundary_type t,icu::Locale const &loc)
108 {
109     UErrorCode err=U_ZERO_ERROR;
110     hold_ptr<icu::BreakIterator> bi;
111     switch(t) {
112     case character:
113         bi.reset(icu::BreakIterator::createCharacterInstance(loc,err));
114         break;
115     case word:
116         bi.reset(icu::BreakIterator::createWordInstance(loc,err));
117         break;
118     case sentence:
119         bi.reset(icu::BreakIterator::createSentenceInstance(loc,err));
120         break;
121     case line:
122         bi.reset(icu::BreakIterator::createLineInstance(loc,err));
123         break;
124     default:
125         throw std::runtime_error("Invalid iteration type");
126     }
127     check_and_throw_icu_error(err);
128     if(!bi.get())
129         throw std::runtime_error("Failed to create break iterator");
130     return bi.release();
131 }
132 
133 
134 template<typename CharType>
do_map(boundary_type t,CharType const * begin,CharType const * end,icu::Locale const & loc,std::string const & encoding)135 index_type do_map(boundary_type t,CharType const *begin,CharType const *end,icu::Locale const &loc,std::string const &encoding)
136 {
137     index_type indx;
138     hold_ptr<icu::BreakIterator> bi(get_iterator(t,loc));
139 
140 #if U_ICU_VERSION_MAJOR_NUM*100 + U_ICU_VERSION_MINOR_NUM >= 306
141     UErrorCode err=U_ZERO_ERROR;
142     if(sizeof(CharType) == 2 || (sizeof(CharType)==1 && encoding=="UTF-8"))
143     {
144         UText *ut=0;
145         try {
146             if(sizeof(CharType)==1)
147                 ut=utext_openUTF8(0,reinterpret_cast<char const *>(begin),end-begin,&err);
148             else // sizeof(CharType)==2
149                 ut=utext_openUChars(0,reinterpret_cast<UChar const *>(begin),end-begin,&err);
150 
151             check_and_throw_icu_error(err);
152             err=U_ZERO_ERROR;
153             if(!ut) throw std::runtime_error("Failed to create UText");
154             bi->setText(ut,err);
155             check_and_throw_icu_error(err);
156             index_type res=map_direct(t,bi.get(),end-begin);
157             indx.swap(res);
158         }
159         catch(...) {
160             if(ut)
161                 utext_close(ut);
162             throw;
163         }
164         if(ut) utext_close(ut);
165     }
166     else
167 #endif
168     {
169         icu_std_converter<CharType> cvt(encoding);
170         icu::UnicodeString str=cvt.icu(begin,end);
171         bi->setText(str);
172         index_type indirect = map_direct(t,bi.get(),str.length());
173         indx=indirect;
174         for(size_t i=1;i<indirect.size();i++) {
175             size_t offset_inderect=indirect[i-1].offset;
176             size_t diff = indirect[i].offset - offset_inderect;
177             size_t offset_direct=indx[i-1].offset;
178             indx[i].offset=offset_direct + cvt.cut(str,begin,end,diff,offset_inderect,offset_direct);
179         }
180     }
181     return indx;
182 } // do_map
183 
184 template<typename CharType>
185 class boundary_indexing_impl : public boundary_indexing<CharType> {
186 public:
boundary_indexing_impl(cdata const & data)187     boundary_indexing_impl(cdata const &data) :
188         locale_(data.locale),
189         encoding_(data.encoding)
190     {
191     }
map(boundary_type t,CharType const * begin,CharType const * end) const192     index_type map(boundary_type t,CharType const *begin,CharType const *end) const
193     {
194         return do_map<CharType>(t,begin,end,locale_,encoding_);
195     }
196 private:
197     icu::Locale locale_;
198     std::string encoding_;
199 };
200 
201 
202 
203 } // impl_icu
204 } // boundary
205 
206 namespace impl_icu {
create_boundary(std::locale const & in,cdata const & cd,character_facet_type type)207     std::locale create_boundary(std::locale const &in,cdata const &cd,character_facet_type type)
208     {
209         using namespace boost::locale::boundary::impl_icu;
210         switch(type) {
211         case char_facet:
212             return std::locale(in,new boundary_indexing_impl<char>(cd));
213         case wchar_t_facet:
214             return std::locale(in,new boundary_indexing_impl<wchar_t>(cd));
215         #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
216         case char16_t_facet:
217             return std::locale(in,new boundary_indexing_impl<char16_t>(cd));
218         #endif
219         #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
220         case char32_t_facet:
221             return std::locale(in,new boundary_indexing_impl<char32_t>(cd));
222         #endif
223         default:
224             return in;
225         }
226     }
227 } // impl_icu
228 
229 } // locale
230 } // boost
231 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
232