1 //
2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #define BOOST_LOCALE_SOURCE
9 #include <boost/locale/boundary.hpp>
10 #include <boost/locale/generator.hpp>
11 #include <boost/locale/hold_ptr.hpp>
12 #include <unicode/uversion.h>
13 #if U_ICU_VERSION_MAJOR_NUM*100 + U_ICU_VERSION_MINOR_NUM >= 306
14 #include <unicode/utext.h>
15 #endif
16 #include <unicode/brkiter.h>
17 #include <unicode/rbbi.h>
18
19 #include "cdata.hpp"
20 #include "all_generator.hpp"
21 #include "icu_util.hpp"
22 #include "uconv.hpp"
23
24 namespace boost {
25 namespace locale {
26 namespace boundary {
27 namespace impl_icu {
28
29 using namespace boost::locale::impl_icu;
30
map_direct(boundary_type t,icu::BreakIterator * it,int reserve)31 index_type map_direct(boundary_type t,icu::BreakIterator *it,int reserve)
32 {
33 index_type indx;
34 indx.reserve(reserve);
35 #if U_ICU_VERSION_MAJOR_NUM >= 52
36 icu::BreakIterator *rbbi=it;
37 #else
38 icu::RuleBasedBreakIterator *rbbi=dynamic_cast<icu::RuleBasedBreakIterator *>(it);
39 #endif
40
41 indx.push_back(break_info());
42 it->first();
43 int pos=0;
44 while((pos=it->next())!=icu::BreakIterator::DONE) {
45 indx.push_back(break_info(pos));
46 /// Character does not have any specific break types
47 if(t!=character && rbbi) {
48 //
49 // There is a collapse for MSVC: int32_t defined by both boost::cstdint and icu...
50 // So need to pick one ;(
51 //
52 std::vector< ::int32_t> buffer;
53 ::int32_t membuf[8]={0}; // try not to use memory allocation if possible
54 ::int32_t *buf=membuf;
55
56 UErrorCode err=U_ZERO_ERROR;
57 int n = rbbi->getRuleStatusVec(buf,8,err);
58
59 if(err == U_BUFFER_OVERFLOW_ERROR) {
60 buf=&buffer.front();
61 buffer.resize(n,0);
62 n = rbbi->getRuleStatusVec(buf,buffer.size(),err);
63 }
64
65 check_and_throw_icu_error(err);
66
67 for(int i=0;i<n;i++) {
68 switch(t) {
69 case word:
70 if(UBRK_WORD_NONE<=buf[i] && buf[i]<UBRK_WORD_NONE_LIMIT)
71 indx.back().rule |= word_none;
72 else if(UBRK_WORD_NUMBER<=buf[i] && buf[i]<UBRK_WORD_NUMBER_LIMIT)
73 indx.back().rule |= word_number;
74 else if(UBRK_WORD_LETTER<=buf[i] && buf[i]<UBRK_WORD_LETTER_LIMIT)
75 indx.back().rule |= word_letter;
76 else if(UBRK_WORD_KANA<=buf[i] && buf[i]<UBRK_WORD_KANA_LIMIT)
77 indx.back().rule |= word_kana;
78 else if(UBRK_WORD_IDEO<=buf[i] && buf[i]<UBRK_WORD_IDEO_LIMIT)
79 indx.back().rule |= word_ideo;
80 break;
81
82 case line:
83 if(UBRK_LINE_SOFT<=buf[i] && buf[i]<UBRK_LINE_SOFT_LIMIT)
84 indx.back().rule |= line_soft;
85 else if(UBRK_LINE_HARD<=buf[i] && buf[i]<UBRK_LINE_HARD_LIMIT)
86 indx.back().rule |= line_hard;
87 break;
88
89 case sentence:
90 if(UBRK_SENTENCE_TERM<=buf[i] && buf[i]<UBRK_SENTENCE_TERM_LIMIT)
91 indx.back().rule |= sentence_term;
92 else if(UBRK_SENTENCE_SEP<=buf[i] && buf[i]<UBRK_SENTENCE_SEP_LIMIT)
93 indx.back().rule |= sentence_sep;
94 break;
95 default:
96 ;
97 }
98 }
99 }
100 else {
101 indx.back().rule |=character_any; // Baisc mark... for character
102 }
103 }
104 return indx;
105 }
106
get_iterator(boundary_type t,icu::Locale const & loc)107 icu::BreakIterator *get_iterator(boundary_type t,icu::Locale const &loc)
108 {
109 UErrorCode err=U_ZERO_ERROR;
110 hold_ptr<icu::BreakIterator> bi;
111 switch(t) {
112 case character:
113 bi.reset(icu::BreakIterator::createCharacterInstance(loc,err));
114 break;
115 case word:
116 bi.reset(icu::BreakIterator::createWordInstance(loc,err));
117 break;
118 case sentence:
119 bi.reset(icu::BreakIterator::createSentenceInstance(loc,err));
120 break;
121 case line:
122 bi.reset(icu::BreakIterator::createLineInstance(loc,err));
123 break;
124 default:
125 throw std::runtime_error("Invalid iteration type");
126 }
127 check_and_throw_icu_error(err);
128 if(!bi.get())
129 throw std::runtime_error("Failed to create break iterator");
130 return bi.release();
131 }
132
133
134 template<typename CharType>
do_map(boundary_type t,CharType const * begin,CharType const * end,icu::Locale const & loc,std::string const & encoding)135 index_type do_map(boundary_type t,CharType const *begin,CharType const *end,icu::Locale const &loc,std::string const &encoding)
136 {
137 index_type indx;
138 hold_ptr<icu::BreakIterator> bi(get_iterator(t,loc));
139
140 #if U_ICU_VERSION_MAJOR_NUM*100 + U_ICU_VERSION_MINOR_NUM >= 306
141 UErrorCode err=U_ZERO_ERROR;
142 if(sizeof(CharType) == 2 || (sizeof(CharType)==1 && encoding=="UTF-8"))
143 {
144 UText *ut=0;
145 try {
146 if(sizeof(CharType)==1)
147 ut=utext_openUTF8(0,reinterpret_cast<char const *>(begin),end-begin,&err);
148 else // sizeof(CharType)==2
149 ut=utext_openUChars(0,reinterpret_cast<UChar const *>(begin),end-begin,&err);
150
151 check_and_throw_icu_error(err);
152 err=U_ZERO_ERROR;
153 if(!ut) throw std::runtime_error("Failed to create UText");
154 bi->setText(ut,err);
155 check_and_throw_icu_error(err);
156 index_type res=map_direct(t,bi.get(),end-begin);
157 indx.swap(res);
158 }
159 catch(...) {
160 if(ut)
161 utext_close(ut);
162 throw;
163 }
164 if(ut) utext_close(ut);
165 }
166 else
167 #endif
168 {
169 icu_std_converter<CharType> cvt(encoding);
170 icu::UnicodeString str=cvt.icu(begin,end);
171 bi->setText(str);
172 index_type indirect = map_direct(t,bi.get(),str.length());
173 indx=indirect;
174 for(size_t i=1;i<indirect.size();i++) {
175 size_t offset_inderect=indirect[i-1].offset;
176 size_t diff = indirect[i].offset - offset_inderect;
177 size_t offset_direct=indx[i-1].offset;
178 indx[i].offset=offset_direct + cvt.cut(str,begin,end,diff,offset_inderect,offset_direct);
179 }
180 }
181 return indx;
182 } // do_map
183
184 template<typename CharType>
185 class boundary_indexing_impl : public boundary_indexing<CharType> {
186 public:
boundary_indexing_impl(cdata const & data)187 boundary_indexing_impl(cdata const &data) :
188 locale_(data.locale),
189 encoding_(data.encoding)
190 {
191 }
map(boundary_type t,CharType const * begin,CharType const * end) const192 index_type map(boundary_type t,CharType const *begin,CharType const *end) const
193 {
194 return do_map<CharType>(t,begin,end,locale_,encoding_);
195 }
196 private:
197 icu::Locale locale_;
198 std::string encoding_;
199 };
200
201
202
203 } // impl_icu
204 } // boundary
205
206 namespace impl_icu {
create_boundary(std::locale const & in,cdata const & cd,character_facet_type type)207 std::locale create_boundary(std::locale const &in,cdata const &cd,character_facet_type type)
208 {
209 using namespace boost::locale::boundary::impl_icu;
210 switch(type) {
211 case char_facet:
212 return std::locale(in,new boundary_indexing_impl<char>(cd));
213 case wchar_t_facet:
214 return std::locale(in,new boundary_indexing_impl<wchar_t>(cd));
215 #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
216 case char16_t_facet:
217 return std::locale(in,new boundary_indexing_impl<char16_t>(cd));
218 #endif
219 #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
220 case char32_t_facet:
221 return std::locale(in,new boundary_indexing_impl<char32_t>(cd));
222 #endif
223 default:
224 return in;
225 }
226 }
227 } // impl_icu
228
229 } // locale
230 } // boost
231 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
232