• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /**
5 *******************************************************************************
6 * Copyright (C) 1996-2014, International Business Machines Corporation and
7 * others. All Rights Reserved.
8 *******************************************************************************
9 */
10 package ohos.global.icu.impl.coll;
11 
12 import ohos.global.icu.util.ByteArrayWrapper;
13 
14 /**
15  * <p>Binary Ordered Compression Scheme for Unicode</p>
16  *
17  * <p>Users are strongly encouraged to read the ICU paper on
18  * <a href="http://www.icu-project.org/docs/papers/binary_ordered_compression_for_unicode.html">
19  * BOCU</a> before attempting to use this class.</p>
20  *
21  * <p>BOCU is used to compress unicode text into a stream of unsigned
22  * bytes.  For many kinds of text the compression compares favorably
23  * to UTF-8, and for some kinds of text (such as CJK) it does better.
24  * The resulting bytes will compare in the same order as the original
25  * code points.  The byte stream does not contain the values 0, 1, or
26  * 2.</p>
27  *
28  * <p>One example of a use of BOCU is in
29  * ohos.global.icu.text.Collator#getCollationKey(String) for a RuleBasedCollator object with
30  * collation strength IDENTICAL. The result CollationKey will consist of the
31  * collation order of the source string followed by the BOCU result of the
32  * source string.
33  * </p>
34  *
35  * <p>Unlike a UTF encoding, BOCU-compressed text is not suitable for
36  * random access.</p>
37  *
38  * <p>Method: Slope Detection<br> Remember the previous code point
39  * (initial 0).  For each code point in the string, encode the
40  * difference with the previous one.  Similar to a UTF, the length of
41  * the byte sequence is encoded in the lead bytes.  Unlike a UTF, the
42  * trail byte values may overlap with lead/single byte values.  The
43  * signedness of the difference must be encoded as the most
44  * significant part.</p>
45  *
46  * <p>We encode differences with few bytes if their absolute values
47  * are small.  For correct ordering, we must treat the entire value
48  * range -10ffff..+10ffff in ascending order, which forbids encoding
49  * the sign and the absolute value separately. Instead, we split the
50  * lead byte range in the middle and encode non-negative values going
51  * up and negative values going down.</p>
52  *
53  * <p>For very small absolute values, the difference is added to a
54  * middle byte value for single-byte encoded differences.  For
55  * somewhat larger absolute values, the difference is divided by the
56  * number of byte values available, the modulo is used for one trail
57  * byte, and the remainder is added to a lead byte avoiding the
58  * single-byte range.  For large absolute values, the difference is
59  * similarly encoded in three bytes. (Syn Wee, I need examples
60  * here.)</p>
61  *
62  * <p>BOCU does not use byte values 0, 1, or 2, but uses all other
63  * byte values for lead and single bytes, so that the middle range of
64  * single bytes is as large as possible.</p>
65  *
66  * <p>Note that the lead byte ranges overlap some, but that the
67  * sequences as a whole are well ordered. I.e., even if the lead byte
68  * is the same for sequences of different lengths, the trail bytes
69  * establish correct order.  It would be possible to encode slightly
70  * larger ranges for each length (>1) by subtracting the lower bound
71  * of the range. However, that would also slow down the calculation.
72  * (Syn Wee, need an example).</p>
73  *
74  * <p>For the actual string encoding, an optimization moves the
75  * previous code point value to the middle of its Unicode script block
76  * to minimize the differences in same-script text runs.  (Syn Wee,
77  * need an example.)</p>
78  *
79  * @author Syn Wee Quek
80  * @hide exposed on OHOS
81  */
82 public class BOCSU
83 {
84     // public methods -------------------------------------------------------
85 
86     /**
87      * Encode the code points of a string as
88      * a sequence of byte-encoded differences (slope detection),
89      * preserving lexical order.
90      *
91      * <p>Optimize the difference-taking for runs of Unicode text within
92      * small scripts:
93      *
94      * <p>Most small scripts are allocated within aligned 128-blocks of Unicode
95      * code points. Lexical order is preserved if "prev" is always moved
96      * into the middle of such a block.
97      *
98      * <p>Additionally, "prev" is moved from anywhere in the Unihan
99      * area into the middle of that area.
100      * Note that the identical-level run in a sort key is generated from
101      * NFD text - there are never Hangul characters included.
102      */
writeIdenticalLevelRun(int prev, CharSequence s, int i, int length, ByteArrayWrapper sink)103     public static int writeIdenticalLevelRun(int prev, CharSequence s, int i, int length, ByteArrayWrapper sink) {
104         while (i < length) {
105             // We must have capacity>=SLOPE_MAX_BYTES in case writeDiff() writes that much,
106             // but we do not want to force the sink to allocate
107             // for a large min_capacity because we might actually only write one byte.
108             ensureAppendCapacity(sink, 16, s.length() * 2);
109             byte[] buffer = sink.bytes;
110             int capacity = buffer.length;
111             int p = sink.size;
112             int lastSafe = capacity - SLOPE_MAX_BYTES_;
113             while (i < length && p <= lastSafe) {
114                 if (prev < 0x4e00 || prev >= 0xa000) {
115                     prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
116                 } else {
117                     // Unihan U+4e00..U+9fa5:
118                     // double-bytes down from the upper end
119                     prev = 0x9fff - SLOPE_REACH_POS_2_;
120                 }
121 
122                 int c = Character.codePointAt(s, i);
123                 i += Character.charCount(c);
124                 if (c == 0xfffe) {
125                     buffer[p++] = 2;  // merge separator
126                     prev = 0;
127                 } else {
128                     p = writeDiff(c - prev, buffer, p);
129                     prev = c;
130                 }
131             }
132             sink.size = p;
133         }
134         return prev;
135     }
136 
ensureAppendCapacity(ByteArrayWrapper sink, int minCapacity, int desiredCapacity)137     private static void ensureAppendCapacity(ByteArrayWrapper sink, int minCapacity, int desiredCapacity) {
138         int remainingCapacity = sink.bytes.length - sink.size;
139         if (remainingCapacity >= minCapacity) { return; }
140         if (desiredCapacity < minCapacity) { desiredCapacity = minCapacity; }
141         sink.ensureCapacity(sink.size + desiredCapacity);
142     }
143 
144     // private data members --------------------------------------------------
145 
146     /**
147      * Do not use byte values 0, 1, 2 because they are separators in sort keys.
148      */
149     private static final int SLOPE_MIN_ = 3;
150     private static final int SLOPE_MAX_ = 0xff;
151     private static final int SLOPE_MIDDLE_ = 0x81;
152     private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_ - SLOPE_MIN_ + 1;
153     private static final int SLOPE_MAX_BYTES_ = 4;
154 
155     /**
156      * Number of lead bytes:
157      * 1        middle byte for 0
158      * 2*80=160 single bytes for !=0
159      * 2*42=84  for double-byte values
160      * 2*3=6    for 3-byte values
161      * 2*1=2    for 4-byte values
162      *
163      * The sum must be <=SLOPE_TAIL_COUNT.
164      *
165      * Why these numbers?
166      * - There should be >=128 single-byte values to cover 128-blocks
167      *   with small scripts.
168      * - There should be >=20902 single/double-byte values to cover Unihan.
169      * - It helps CJK Extension B some if there are 3-byte values that cover
170      *   the distance between them and Unihan.
171      *   This also helps to jump among distant places in the BMP.
172      * - Four-byte values are necessary to cover the rest of Unicode.
173      *
174      * Symmetrical lead byte counts are for convenience.
175      * With an equal distribution of even and odd differences there is also
176      * no advantage to asymmetrical lead byte counts.
177      */
178     private static final int SLOPE_SINGLE_ = 80;
179     private static final int SLOPE_LEAD_2_ = 42;
180     private static final int SLOPE_LEAD_3_ = 3;
181     //private static final int SLOPE_LEAD_4_ = 1;
182 
183     /**
184      * The difference value range for single-byters.
185      */
186     private static final int SLOPE_REACH_POS_1_ = SLOPE_SINGLE_;
187     private static final int SLOPE_REACH_NEG_1_ = (-SLOPE_SINGLE_);
188 
189     /**
190      * The difference value range for double-byters.
191      */
192     private static final int SLOPE_REACH_POS_2_ =
193         SLOPE_LEAD_2_ * SLOPE_TAIL_COUNT_ + SLOPE_LEAD_2_ - 1;
194     private static final int SLOPE_REACH_NEG_2_ = (-SLOPE_REACH_POS_2_ - 1);
195 
196     /**
197      * The difference value range for 3-byters.
198      */
199     private static final int SLOPE_REACH_POS_3_ = SLOPE_LEAD_3_
200         * SLOPE_TAIL_COUNT_
201         * SLOPE_TAIL_COUNT_
202         + (SLOPE_LEAD_3_ - 1)
203         * SLOPE_TAIL_COUNT_ +
204         (SLOPE_TAIL_COUNT_ - 1);
205     private static final int SLOPE_REACH_NEG_3_ = (-SLOPE_REACH_POS_3_ - 1);
206 
207     /**
208      * The lead byte start values.
209      */
210     private static final int SLOPE_START_POS_2_ = SLOPE_MIDDLE_
211         + SLOPE_SINGLE_ + 1;
212     private static final int SLOPE_START_POS_3_ = SLOPE_START_POS_2_
213         + SLOPE_LEAD_2_;
214     private static final int SLOPE_START_NEG_2_ = SLOPE_MIDDLE_ +
215         SLOPE_REACH_NEG_1_;
216     private static final int SLOPE_START_NEG_3_ = SLOPE_START_NEG_2_
217         - SLOPE_LEAD_2_;
218 
219     // private constructor ---------------------------------------------------
220 
221     /**
222      * Constructor private to prevent initialization
223      */
224     ///CLOVER:OFF
BOCSU()225     private BOCSU()
226     {
227     }
228     ///CLOVER:ON
229 
230     // private methods -------------------------------------------------------
231 
232     /**
233      * Integer division and modulo with negative numerators
234      * yields negative modulo results and quotients that are one more than
235      * what we need here.
236      * @param number which operations are to be performed on
237      * @param factor the factor to use for division
238      * @return (result of division) << 32 | modulo
239      */
getNegDivMod(int number, int factor)240     private static final long getNegDivMod(int number, int factor)
241     {
242         int modulo = number % factor;
243         long result = number / factor;
244         if (modulo < 0) {
245             -- result;
246             modulo += factor;
247         }
248         return (result << 32) | modulo;
249     }
250 
251     /**
252      * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
253      * preserving lexical order
254      * @param diff
255      * @param buffer byte buffer to append to
256      * @param offset to the byte buffer to start appending
257      * @return end offset where the appending stops
258      */
writeDiff(int diff, byte buffer[], int offset)259     private static final int writeDiff(int diff, byte buffer[], int offset)
260     {
261         if (diff >= SLOPE_REACH_NEG_1_) {
262             if (diff <= SLOPE_REACH_POS_1_) {
263                 buffer[offset ++] = (byte)(SLOPE_MIDDLE_ + diff);
264             }
265             else if (diff <= SLOPE_REACH_POS_2_) {
266                 buffer[offset ++] = (byte)(SLOPE_START_POS_2_
267                                            + (diff / SLOPE_TAIL_COUNT_));
268                 buffer[offset ++] = (byte)(SLOPE_MIN_ +
269                                            (diff % SLOPE_TAIL_COUNT_));
270             }
271             else if (diff <= SLOPE_REACH_POS_3_) {
272                 buffer[offset + 2] = (byte)(SLOPE_MIN_
273                                             + (diff % SLOPE_TAIL_COUNT_));
274                 diff /= SLOPE_TAIL_COUNT_;
275                 buffer[offset + 1] = (byte)(SLOPE_MIN_
276                                             + (diff % SLOPE_TAIL_COUNT_));
277                 buffer[offset] = (byte)(SLOPE_START_POS_3_
278                                         + (diff / SLOPE_TAIL_COUNT_));
279                 offset += 3;
280             }
281             else {
282                 buffer[offset + 3] = (byte)(SLOPE_MIN_
283                                             + diff % SLOPE_TAIL_COUNT_);
284                 diff /= SLOPE_TAIL_COUNT_;
285                 buffer[offset + 2] = (byte)(SLOPE_MIN_
286                                         + diff % SLOPE_TAIL_COUNT_);
287                 diff /= SLOPE_TAIL_COUNT_;
288                 buffer[offset + 1] = (byte)(SLOPE_MIN_
289                                             + diff % SLOPE_TAIL_COUNT_);
290                 buffer[offset] = (byte)SLOPE_MAX_;
291                 offset += 4;
292             }
293         }
294         else {
295             long division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
296             int modulo = (int)division;
297             if (diff >= SLOPE_REACH_NEG_2_) {
298                 diff = (int)(division >> 32);
299                 buffer[offset ++] = (byte)(SLOPE_START_NEG_2_ + diff);
300                 buffer[offset ++] = (byte)(SLOPE_MIN_ + modulo);
301             }
302             else if (diff >= SLOPE_REACH_NEG_3_) {
303                 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
304                 diff = (int)(division >> 32);
305                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
306                 modulo = (int)division;
307                 diff = (int)(division >> 32);
308                 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
309                 buffer[offset] = (byte)(SLOPE_START_NEG_3_ + diff);
310                 offset += 3;
311             }
312             else {
313                 buffer[offset + 3] = (byte)(SLOPE_MIN_ + modulo);
314                 diff = (int)(division >> 32);
315                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
316                 modulo = (int)division;
317                 diff = (int)(division >> 32);
318                 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
319                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
320                 modulo = (int)division;
321                 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
322                 buffer[offset] = SLOPE_MIN_;
323                 offset += 4;
324             }
325         }
326         return offset;
327     }
328 }
329