1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2008-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 05/11/2008 Andy Heninger Port from Java
10 **********************************************************************
11 */
12
13 #include <utility>
14
15 #include "unicode/utypes.h"
16
17 #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
18
19 #include "unicode/brkiter.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/uchar.h"
22 #include "unicode/unifilt.h"
23 #include "unicode/uniset.h"
24
25 #include "brktrans.h"
26 #include "cmemory.h"
27 #include "mutex.h"
28 #include "uprops.h"
29 #include "uinvchar.h"
30 #include "util.h"
31 #include "uvectr32.h"
32
33 U_NAMESPACE_BEGIN
34
35 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
36
37 static const UChar SPACE = 32; // ' '
38
39
40 /**
41 * Constructs a transliterator with the default delimiters '{' and
42 * '}'.
43 */
BreakTransliterator(UnicodeFilter * adoptedFilter)44 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
45 Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
46 cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) {
47 }
48
49
50 /**
51 * Destructor.
52 */
~BreakTransliterator()53 BreakTransliterator::~BreakTransliterator() {
54 }
55
56 /**
57 * Copy constructor.
58 */
BreakTransliterator(const BreakTransliterator & o)59 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
60 Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) {
61 }
62
63
64 /**
65 * Transliterator API.
66 */
clone() const67 BreakTransliterator* BreakTransliterator::clone() const {
68 return new BreakTransliterator(*this);
69 }
70
71 /**
72 * Implements {@link Transliterator#handleTransliterate}.
73 */
handleTransliterate(Replaceable & text,UTransPosition & offsets,UBool isIncremental) const74 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
75 UBool isIncremental ) const {
76
77 UErrorCode status = U_ZERO_ERROR;
78 LocalPointer<BreakIterator> bi;
79 LocalPointer<UVector32> boundaries;
80
81 {
82 Mutex m;
83 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
84 boundaries = std::move(nonConstThis->cachedBoundaries);
85 bi = std::move(nonConstThis->cachedBI);
86 }
87 if (bi.isNull()) {
88 bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
89 }
90 if (boundaries.isNull()) {
91 boundaries.adoptInstead(new UVector32(status));
92 }
93
94 if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) {
95 return;
96 }
97
98 boundaries->removeAllElements();
99 UnicodeString sText = replaceableAsString(text);
100 bi->setText(sText);
101 bi->preceding(offsets.start);
102
103 // To make things much easier, we will stack the boundaries, and then insert at the end.
104 // generally, we won't need too many, since we will be filtered.
105
106 int32_t boundary;
107 for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
108 if (boundary == 0) continue;
109 // HACK: Check to see that preceeding item was a letter
110
111 UChar32 cp = sText.char32At(boundary-1);
112 int type = u_charType(cp);
113 //System.out.println(Integer.toString(cp,16) + " (before): " + type);
114 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
115
116 cp = sText.char32At(boundary);
117 type = u_charType(cp);
118 //System.out.println(Integer.toString(cp,16) + " (after): " + type);
119 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
120
121 boundaries->addElement(boundary, status);
122 // printf("Boundary at %d\n", boundary);
123 }
124
125 int delta = 0;
126 int lastBoundary = 0;
127
128 if (boundaries->size() != 0) { // if we found something, adjust
129 delta = boundaries->size() * fInsertion.length();
130 lastBoundary = boundaries->lastElementi();
131
132 // we do this from the end backwards, so that we don't have to keep updating.
133
134 while (boundaries->size() > 0) {
135 boundary = boundaries->popi();
136 text.handleReplaceBetween(boundary, boundary, fInsertion);
137 }
138 }
139
140 // Now fix up the return values
141 offsets.contextLimit += delta;
142 offsets.limit += delta;
143 offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
144
145 // Return break iterator & boundaries vector to the cache.
146 {
147 Mutex m;
148 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
149 if (nonConstThis->cachedBI.isNull()) {
150 nonConstThis->cachedBI = std::move(bi);
151 }
152 if (nonConstThis->cachedBoundaries.isNull()) {
153 nonConstThis->cachedBoundaries = std::move(boundaries);
154 }
155 }
156
157 // TODO: do something with U_FAILURE(status);
158 // (need to look at transliterators overall, not just here.)
159 }
160
161 //
162 // getInsertion()
163 //
getInsertion() const164 const UnicodeString &BreakTransliterator::getInsertion() const {
165 return fInsertion;
166 }
167
168 //
169 // setInsertion()
170 //
setInsertion(const UnicodeString & insertion)171 void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
172 this->fInsertion = insertion;
173 }
174
175 //
176 // replaceableAsString Hack to let break iterators work
177 // on the replaceable text from transliterators.
178 // In practice, the only real Replaceable type that we
179 // will be seeing is UnicodeString, so this function
180 // will normally be efficient.
181 //
replaceableAsString(Replaceable & r)182 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
183 UnicodeString s;
184 UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
185 if (rs != NULL) {
186 s = *rs;
187 } else {
188 r.extractBetween(0, r.length(), s);
189 }
190 return s;
191 }
192
193 U_NAMESPACE_END
194
195 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
196