1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /** 5 ******************************************************************************* 6 * Copyright (C) 1996-2014, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.impl.coll; 11 12 import ohos.global.icu.util.ByteArrayWrapper; 13 14 /** 15 * <p>Binary Ordered Compression Scheme for Unicode</p> 16 * 17 * <p>Users are strongly encouraged to read the ICU paper on 18 * <a href="http://www.icu-project.org/docs/papers/binary_ordered_compression_for_unicode.html"> 19 * BOCU</a> before attempting to use this class.</p> 20 * 21 * <p>BOCU is used to compress unicode text into a stream of unsigned 22 * bytes. For many kinds of text the compression compares favorably 23 * to UTF-8, and for some kinds of text (such as CJK) it does better. 24 * The resulting bytes will compare in the same order as the original 25 * code points. The byte stream does not contain the values 0, 1, or 26 * 2.</p> 27 * 28 * <p>One example of a use of BOCU is in 29 * ohos.global.icu.text.Collator#getCollationKey(String) for a RuleBasedCollator object with 30 * collation strength IDENTICAL. The result CollationKey will consist of the 31 * collation order of the source string followed by the BOCU result of the 32 * source string. 33 * </p> 34 * 35 * <p>Unlike a UTF encoding, BOCU-compressed text is not suitable for 36 * random access.</p> 37 * 38 * <p>Method: Slope Detection<br> Remember the previous code point 39 * (initial 0). For each code point in the string, encode the 40 * difference with the previous one. Similar to a UTF, the length of 41 * the byte sequence is encoded in the lead bytes. Unlike a UTF, the 42 * trail byte values may overlap with lead/single byte values. The 43 * signedness of the difference must be encoded as the most 44 * significant part.</p> 45 * 46 * <p>We encode differences with few bytes if their absolute values 47 * are small. For correct ordering, we must treat the entire value 48 * range -10ffff..+10ffff in ascending order, which forbids encoding 49 * the sign and the absolute value separately. Instead, we split the 50 * lead byte range in the middle and encode non-negative values going 51 * up and negative values going down.</p> 52 * 53 * <p>For very small absolute values, the difference is added to a 54 * middle byte value for single-byte encoded differences. For 55 * somewhat larger absolute values, the difference is divided by the 56 * number of byte values available, the modulo is used for one trail 57 * byte, and the remainder is added to a lead byte avoiding the 58 * single-byte range. For large absolute values, the difference is 59 * similarly encoded in three bytes. (Syn Wee, I need examples 60 * here.)</p> 61 * 62 * <p>BOCU does not use byte values 0, 1, or 2, but uses all other 63 * byte values for lead and single bytes, so that the middle range of 64 * single bytes is as large as possible.</p> 65 * 66 * <p>Note that the lead byte ranges overlap some, but that the 67 * sequences as a whole are well ordered. I.e., even if the lead byte 68 * is the same for sequences of different lengths, the trail bytes 69 * establish correct order. It would be possible to encode slightly 70 * larger ranges for each length (>1) by subtracting the lower bound 71 * of the range. However, that would also slow down the calculation. 72 * (Syn Wee, need an example).</p> 73 * 74 * <p>For the actual string encoding, an optimization moves the 75 * previous code point value to the middle of its Unicode script block 76 * to minimize the differences in same-script text runs. (Syn Wee, 77 * need an example.)</p> 78 * 79 * @author Syn Wee Quek 80 * @hide exposed on OHOS 81 */ 82 public class BOCSU 83 { 84 // public methods ------------------------------------------------------- 85 86 /** 87 * Encode the code points of a string as 88 * a sequence of byte-encoded differences (slope detection), 89 * preserving lexical order. 90 * 91 * <p>Optimize the difference-taking for runs of Unicode text within 92 * small scripts: 93 * 94 * <p>Most small scripts are allocated within aligned 128-blocks of Unicode 95 * code points. Lexical order is preserved if "prev" is always moved 96 * into the middle of such a block. 97 * 98 * <p>Additionally, "prev" is moved from anywhere in the Unihan 99 * area into the middle of that area. 100 * Note that the identical-level run in a sort key is generated from 101 * NFD text - there are never Hangul characters included. 102 */ writeIdenticalLevelRun(int prev, CharSequence s, int i, int length, ByteArrayWrapper sink)103 public static int writeIdenticalLevelRun(int prev, CharSequence s, int i, int length, ByteArrayWrapper sink) { 104 while (i < length) { 105 // We must have capacity>=SLOPE_MAX_BYTES in case writeDiff() writes that much, 106 // but we do not want to force the sink to allocate 107 // for a large min_capacity because we might actually only write one byte. 108 ensureAppendCapacity(sink, 16, s.length() * 2); 109 byte[] buffer = sink.bytes; 110 int capacity = buffer.length; 111 int p = sink.size; 112 int lastSafe = capacity - SLOPE_MAX_BYTES_; 113 while (i < length && p <= lastSafe) { 114 if (prev < 0x4e00 || prev >= 0xa000) { 115 prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_; 116 } else { 117 // Unihan U+4e00..U+9fa5: 118 // double-bytes down from the upper end 119 prev = 0x9fff - SLOPE_REACH_POS_2_; 120 } 121 122 int c = Character.codePointAt(s, i); 123 i += Character.charCount(c); 124 if (c == 0xfffe) { 125 buffer[p++] = 2; // merge separator 126 prev = 0; 127 } else { 128 p = writeDiff(c - prev, buffer, p); 129 prev = c; 130 } 131 } 132 sink.size = p; 133 } 134 return prev; 135 } 136 ensureAppendCapacity(ByteArrayWrapper sink, int minCapacity, int desiredCapacity)137 private static void ensureAppendCapacity(ByteArrayWrapper sink, int minCapacity, int desiredCapacity) { 138 int remainingCapacity = sink.bytes.length - sink.size; 139 if (remainingCapacity >= minCapacity) { return; } 140 if (desiredCapacity < minCapacity) { desiredCapacity = minCapacity; } 141 sink.ensureCapacity(sink.size + desiredCapacity); 142 } 143 144 // private data members -------------------------------------------------- 145 146 /** 147 * Do not use byte values 0, 1, 2 because they are separators in sort keys. 148 */ 149 private static final int SLOPE_MIN_ = 3; 150 private static final int SLOPE_MAX_ = 0xff; 151 private static final int SLOPE_MIDDLE_ = 0x81; 152 private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_ - SLOPE_MIN_ + 1; 153 private static final int SLOPE_MAX_BYTES_ = 4; 154 155 /** 156 * Number of lead bytes: 157 * 1 middle byte for 0 158 * 2*80=160 single bytes for !=0 159 * 2*42=84 for double-byte values 160 * 2*3=6 for 3-byte values 161 * 2*1=2 for 4-byte values 162 * 163 * The sum must be <=SLOPE_TAIL_COUNT. 164 * 165 * Why these numbers? 166 * - There should be >=128 single-byte values to cover 128-blocks 167 * with small scripts. 168 * - There should be >=20902 single/double-byte values to cover Unihan. 169 * - It helps CJK Extension B some if there are 3-byte values that cover 170 * the distance between them and Unihan. 171 * This also helps to jump among distant places in the BMP. 172 * - Four-byte values are necessary to cover the rest of Unicode. 173 * 174 * Symmetrical lead byte counts are for convenience. 175 * With an equal distribution of even and odd differences there is also 176 * no advantage to asymmetrical lead byte counts. 177 */ 178 private static final int SLOPE_SINGLE_ = 80; 179 private static final int SLOPE_LEAD_2_ = 42; 180 private static final int SLOPE_LEAD_3_ = 3; 181 //private static final int SLOPE_LEAD_4_ = 1; 182 183 /** 184 * The difference value range for single-byters. 185 */ 186 private static final int SLOPE_REACH_POS_1_ = SLOPE_SINGLE_; 187 private static final int SLOPE_REACH_NEG_1_ = (-SLOPE_SINGLE_); 188 189 /** 190 * The difference value range for double-byters. 191 */ 192 private static final int SLOPE_REACH_POS_2_ = 193 SLOPE_LEAD_2_ * SLOPE_TAIL_COUNT_ + SLOPE_LEAD_2_ - 1; 194 private static final int SLOPE_REACH_NEG_2_ = (-SLOPE_REACH_POS_2_ - 1); 195 196 /** 197 * The difference value range for 3-byters. 198 */ 199 private static final int SLOPE_REACH_POS_3_ = SLOPE_LEAD_3_ 200 * SLOPE_TAIL_COUNT_ 201 * SLOPE_TAIL_COUNT_ 202 + (SLOPE_LEAD_3_ - 1) 203 * SLOPE_TAIL_COUNT_ + 204 (SLOPE_TAIL_COUNT_ - 1); 205 private static final int SLOPE_REACH_NEG_3_ = (-SLOPE_REACH_POS_3_ - 1); 206 207 /** 208 * The lead byte start values. 209 */ 210 private static final int SLOPE_START_POS_2_ = SLOPE_MIDDLE_ 211 + SLOPE_SINGLE_ + 1; 212 private static final int SLOPE_START_POS_3_ = SLOPE_START_POS_2_ 213 + SLOPE_LEAD_2_; 214 private static final int SLOPE_START_NEG_2_ = SLOPE_MIDDLE_ + 215 SLOPE_REACH_NEG_1_; 216 private static final int SLOPE_START_NEG_3_ = SLOPE_START_NEG_2_ 217 - SLOPE_LEAD_2_; 218 219 // private constructor --------------------------------------------------- 220 221 /** 222 * Constructor private to prevent initialization 223 */ 224 ///CLOVER:OFF BOCSU()225 private BOCSU() 226 { 227 } 228 ///CLOVER:ON 229 230 // private methods ------------------------------------------------------- 231 232 /** 233 * Integer division and modulo with negative numerators 234 * yields negative modulo results and quotients that are one more than 235 * what we need here. 236 * @param number which operations are to be performed on 237 * @param factor the factor to use for division 238 * @return (result of division) << 32 | modulo 239 */ getNegDivMod(int number, int factor)240 private static final long getNegDivMod(int number, int factor) 241 { 242 int modulo = number % factor; 243 long result = number / factor; 244 if (modulo < 0) { 245 -- result; 246 modulo += factor; 247 } 248 return (result << 32) | modulo; 249 } 250 251 /** 252 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes, 253 * preserving lexical order 254 * @param diff 255 * @param buffer byte buffer to append to 256 * @param offset to the byte buffer to start appending 257 * @return end offset where the appending stops 258 */ writeDiff(int diff, byte buffer[], int offset)259 private static final int writeDiff(int diff, byte buffer[], int offset) 260 { 261 if (diff >= SLOPE_REACH_NEG_1_) { 262 if (diff <= SLOPE_REACH_POS_1_) { 263 buffer[offset ++] = (byte)(SLOPE_MIDDLE_ + diff); 264 } 265 else if (diff <= SLOPE_REACH_POS_2_) { 266 buffer[offset ++] = (byte)(SLOPE_START_POS_2_ 267 + (diff / SLOPE_TAIL_COUNT_)); 268 buffer[offset ++] = (byte)(SLOPE_MIN_ + 269 (diff % SLOPE_TAIL_COUNT_)); 270 } 271 else if (diff <= SLOPE_REACH_POS_3_) { 272 buffer[offset + 2] = (byte)(SLOPE_MIN_ 273 + (diff % SLOPE_TAIL_COUNT_)); 274 diff /= SLOPE_TAIL_COUNT_; 275 buffer[offset + 1] = (byte)(SLOPE_MIN_ 276 + (diff % SLOPE_TAIL_COUNT_)); 277 buffer[offset] = (byte)(SLOPE_START_POS_3_ 278 + (diff / SLOPE_TAIL_COUNT_)); 279 offset += 3; 280 } 281 else { 282 buffer[offset + 3] = (byte)(SLOPE_MIN_ 283 + diff % SLOPE_TAIL_COUNT_); 284 diff /= SLOPE_TAIL_COUNT_; 285 buffer[offset + 2] = (byte)(SLOPE_MIN_ 286 + diff % SLOPE_TAIL_COUNT_); 287 diff /= SLOPE_TAIL_COUNT_; 288 buffer[offset + 1] = (byte)(SLOPE_MIN_ 289 + diff % SLOPE_TAIL_COUNT_); 290 buffer[offset] = (byte)SLOPE_MAX_; 291 offset += 4; 292 } 293 } 294 else { 295 long division = getNegDivMod(diff, SLOPE_TAIL_COUNT_); 296 int modulo = (int)division; 297 if (diff >= SLOPE_REACH_NEG_2_) { 298 diff = (int)(division >> 32); 299 buffer[offset ++] = (byte)(SLOPE_START_NEG_2_ + diff); 300 buffer[offset ++] = (byte)(SLOPE_MIN_ + modulo); 301 } 302 else if (diff >= SLOPE_REACH_NEG_3_) { 303 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo); 304 diff = (int)(division >> 32); 305 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_); 306 modulo = (int)division; 307 diff = (int)(division >> 32); 308 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo); 309 buffer[offset] = (byte)(SLOPE_START_NEG_3_ + diff); 310 offset += 3; 311 } 312 else { 313 buffer[offset + 3] = (byte)(SLOPE_MIN_ + modulo); 314 diff = (int)(division >> 32); 315 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_); 316 modulo = (int)division; 317 diff = (int)(division >> 32); 318 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo); 319 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_); 320 modulo = (int)division; 321 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo); 322 buffer[offset] = SLOPE_MIN_; 323 offset += 4; 324 } 325 } 326 return offset; 327 } 328 } 329