• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  ******************************************************************************
6  *
7  *   Copyright (C) 2009-2015, International Business Machines
8  *   Corporation and others.  All Rights Reserved.
9  *
10  ******************************************************************************
11  */
12 
13 package ohos.global.icu.impl;
14 
15 import ohos.global.icu.text.UnicodeSet.SpanCondition;
16 import ohos.global.icu.util.OutputInt;
17 
18 /**
19  * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
20  *
21  * Latin-1: Look up bytes.
22  * 2-byte characters: Bits organized vertically.
23  * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
24  * Supplementary characters: Binary search over
25  * the supplementary part of the parent set's inversion list.
26  * @hide exposed on OHOS
27  */
28 public final class BMPSet {
29     public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
30 
31     /**
32      * One boolean ('true' or 'false') per Latin-1 character.
33      */
34     private boolean[] latin1Contains;
35 
36     /**
37      * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
38      * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
39      * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
40      *
41      * Bits for 0..FF are unused (0).
42      */
43     private int[] table7FF;
44 
45     /**
46      * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
47      * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
48      * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
49      * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
50      * and set.contains(c) must be called.
51      *
52      * Bits for 0..7FF are unused (0).
53      */
54     private int[] bmpBlockBits;
55 
56     /**
57      * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
58      * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
59      * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
60      */
61     private int[] list4kStarts;
62 
63     /**
64      * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
65      * supplementary code points. The list is terminated with list[listLength-1]=0x110000.
66      */
67     private final int[] list;
68     private final int listLength; // length used; list may be longer to minimize reallocs
69 
BMPSet(final int[] parentList, int parentListLength)70     public BMPSet(final int[] parentList, int parentListLength) {
71         list = parentList;
72         listLength = parentListLength;
73         latin1Contains = new boolean[0x100];
74         table7FF = new int[64];
75         bmpBlockBits = new int[64];
76         list4kStarts = new int[18];
77 
78         /*
79          * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
80          * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
81          * indexes is for finding supplementary code points.
82          */
83         list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
84         int i;
85         for (i = 1; i <= 0x10; ++i) {
86             list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
87         }
88         list4kStarts[0x11] = listLength - 1;
89 
90         initBits();
91     }
92 
BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength)93     public BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength) {
94         list = newParentList;
95         listLength = newParentListLength;
96         latin1Contains = otherBMPSet.latin1Contains.clone();
97         table7FF = otherBMPSet.table7FF.clone();
98         bmpBlockBits = otherBMPSet.bmpBlockBits.clone();
99         list4kStarts = otherBMPSet.list4kStarts.clone();
100     }
101 
contains(int c)102     public boolean contains(int c) {
103         if (c <= 0xff) {
104             return (latin1Contains[c]);
105         } else if (c <= 0x7ff) {
106             return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
107         } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
108             int lead = c >> 12;
109             int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
110             if (twoBits <= 1) {
111                 // All 64 code points with the same bits 15..6
112                 // are either in the set or not.
113                 return (0 != twoBits);
114             } else {
115                 // Look up the code point in its 4k block of code points.
116                 return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
117             }
118         } else if (c <= 0x10ffff) {
119             // surrogate or supplementary code point
120             return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
121         } else {
122             // Out-of-range code points get false, consistent with long-standing
123             // behavior of UnicodeSet.contains(c).
124             return false;
125         }
126     }
127 
128     /**
129      * Span the initial substring for which each character c has spanCondition==contains(c). It must be
130      * spanCondition==0 or 1.
131      *
132      * @param start The start index
133      * @param outCount If not null: Receives the number of code points in the span.
134      * @return the limit (exclusive end) of the span
135      *
136      * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
137      * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
138      * as usual in ICU.
139      */
span(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)140     public final int span(CharSequence s, int start, SpanCondition spanCondition,
141             OutputInt outCount) {
142         char c, c2;
143         int i = start;
144         int limit = s.length();
145         int numSupplementary = 0;
146         if (SpanCondition.NOT_CONTAINED != spanCondition) {
147             // span
148             while (i < limit) {
149                 c = s.charAt(i);
150                 if (c <= 0xff) {
151                     if (!latin1Contains[c]) {
152                         break;
153                     }
154                 } else if (c <= 0x7ff) {
155                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
156                         break;
157                     }
158                 } else if (c < 0xd800 ||
159                            c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
160                     int lead = c >> 12;
161                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
162                     if (twoBits <= 1) {
163                         // All 64 code points with the same bits 15..6
164                         // are either in the set or not.
165                         if (twoBits == 0) {
166                             break;
167                         }
168                     } else {
169                         // Look up the code point in its 4k block of code points.
170                         if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
171                             break;
172                         }
173                     }
174                 } else {
175                     // surrogate pair
176                     int supplementary = Character.toCodePoint(c, c2);
177                     if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
178                         break;
179                     }
180                     ++numSupplementary;
181                     ++i;
182                 }
183                 ++i;
184             }
185         } else {
186             // span not
187             while (i < limit) {
188                 c = s.charAt(i);
189                 if (c <= 0xff) {
190                     if (latin1Contains[c]) {
191                         break;
192                     }
193                 } else if (c <= 0x7ff) {
194                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
195                         break;
196                     }
197                 } else if (c < 0xd800 ||
198                            c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
199                     int lead = c >> 12;
200                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
201                     if (twoBits <= 1) {
202                         // All 64 code points with the same bits 15..6
203                         // are either in the set or not.
204                         if (twoBits != 0) {
205                             break;
206                         }
207                     } else {
208                         // Look up the code point in its 4k block of code points.
209                         if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
210                             break;
211                         }
212                     }
213                 } else {
214                     // surrogate pair
215                     int supplementary = Character.toCodePoint(c, c2);
216                     if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
217                         break;
218                     }
219                     ++numSupplementary;
220                     ++i;
221                 }
222                 ++i;
223             }
224         }
225         if (outCount != null) {
226             int spanLength = i - start;
227             outCount.value = spanLength - numSupplementary;  // number of code points
228         }
229         return i;
230     }
231 
232     /**
233      * Symmetrical with span().
234      * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
235      * limit and spanCondition==0 or 1.
236      *
237      * @return The string index which starts the span (i.e. inclusive).
238      */
spanBack(CharSequence s, int limit, SpanCondition spanCondition)239     public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
240         char c, c2;
241 
242         if (SpanCondition.NOT_CONTAINED != spanCondition) {
243             // span
244             for (;;) {
245                 c = s.charAt(--limit);
246                 if (c <= 0xff) {
247                     if (!latin1Contains[c]) {
248                         break;
249                     }
250                 } else if (c <= 0x7ff) {
251                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
252                         break;
253                     }
254                 } else if (c < 0xd800 ||
255                            c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
256                     int lead = c >> 12;
257                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
258                     if (twoBits <= 1) {
259                         // All 64 code points with the same bits 15..6
260                         // are either in the set or not.
261                         if (twoBits == 0) {
262                             break;
263                         }
264                     } else {
265                         // Look up the code point in its 4k block of code points.
266                         if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
267                             break;
268                         }
269                     }
270                 } else {
271                     // surrogate pair
272                     int supplementary = Character.toCodePoint(c2, c);
273                     if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
274                         break;
275                     }
276                     --limit;
277                 }
278                 if (0 == limit) {
279                     return 0;
280                 }
281             }
282         } else {
283             // span not
284             for (;;) {
285                 c = s.charAt(--limit);
286                 if (c <= 0xff) {
287                     if (latin1Contains[c]) {
288                         break;
289                     }
290                 } else if (c <= 0x7ff) {
291                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
292                         break;
293                     }
294                 } else if (c < 0xd800 ||
295                            c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
296                     int lead = c >> 12;
297                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
298                     if (twoBits <= 1) {
299                         // All 64 code points with the same bits 15..6
300                         // are either in the set or not.
301                         if (twoBits != 0) {
302                             break;
303                         }
304                     } else {
305                         // Look up the code point in its 4k block of code points.
306                         if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
307                             break;
308                         }
309                     }
310                 } else {
311                     // surrogate pair
312                     int supplementary = Character.toCodePoint(c2, c);
313                     if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
314                         break;
315                     }
316                     --limit;
317                 }
318                 if (0 == limit) {
319                     return 0;
320                 }
321             }
322         }
323         return limit + 1;
324     }
325 
326     /**
327      * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
328      */
set32x64Bits(int[] table, int start, int limit)329     private static void set32x64Bits(int[] table, int start, int limit) {
330         assert (64 == table.length);
331         int lead = start >> 6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
332         int trail = start & 0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
333 
334         // Set one bit indicating an all-one block.
335         int bits = 1 << lead;
336         if ((start + 1) == limit) { // Single-character shortcut.
337             table[trail] |= bits;
338             return;
339         }
340 
341         int limitLead = limit >> 6;
342         int limitTrail = limit & 0x3f;
343 
344         if (lead == limitLead) {
345             // Partial vertical bit column.
346             while (trail < limitTrail) {
347                 table[trail++] |= bits;
348             }
349         } else {
350             // Partial vertical bit column,
351             // followed by a bit rectangle,
352             // followed by another partial vertical bit column.
353             if (trail > 0) {
354                 do {
355                     table[trail++] |= bits;
356                 } while (trail < 64);
357                 ++lead;
358             }
359             if (lead < limitLead) {
360                 bits = ~((1 << lead) - 1);
361                 if (limitLead < 0x20) {
362                     bits &= (1 << limitLead) - 1;
363                 }
364                 for (trail = 0; trail < 64; ++trail) {
365                     table[trail] |= bits;
366                 }
367             }
368             // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
369             // In that case, bits=1<<limitLead == 1<<0 == 1
370             // (because Java << uses only the lower 5 bits of the shift operand)
371             // but the bits value is not used because trail<limitTrail is already false.
372             bits = 1 << limitLead;
373             for (trail = 0; trail < limitTrail; ++trail) {
374                 table[trail] |= bits;
375             }
376         }
377     }
378 
initBits()379     private void initBits() {
380         int start, limit;
381         int listIndex = 0;
382 
383         // Set latin1Contains[].
384         do {
385             start = list[listIndex++];
386             if (listIndex < listLength) {
387                 limit = list[listIndex++];
388             } else {
389                 limit = 0x110000;
390             }
391             if (start >= 0x100) {
392                 break;
393             }
394             do {
395                 latin1Contains[start++] = true;
396             } while (start < limit && start < 0x100);
397         } while (limit <= 0x100);
398 
399         // Set table7FF[].
400         while (start < 0x800) {
401             set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
402             if (limit > 0x800) {
403                 start = 0x800;
404                 break;
405             }
406 
407             start = list[listIndex++];
408             if (listIndex < listLength) {
409                 limit = list[listIndex++];
410             } else {
411                 limit = 0x110000;
412             }
413         }
414 
415         // Set bmpBlockBits[].
416         int minStart = 0x800;
417         while (start < 0x10000) {
418             if (limit > 0x10000) {
419                 limit = 0x10000;
420             }
421 
422             if (start < minStart) {
423                 start = minStart;
424             }
425             if (start < limit) { // Else: Another range entirely in a known mixed-value block.
426                 if (0 != (start & 0x3f)) {
427                     // Mixed-value block of 64 code points.
428                     start >>= 6;
429                     bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
430                     start = (start + 1) << 6; // Round up to the next block boundary.
431                     minStart = start; // Ignore further ranges in this block.
432                 }
433                 if (start < limit) {
434                     if (start < (limit & ~0x3f)) {
435                         // Multiple all-ones blocks of 64 code points each.
436                         set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
437                     }
438 
439                     if (0 != (limit & 0x3f)) {
440                         // Mixed-value block of 64 code points.
441                         limit >>= 6;
442                         bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
443                         limit = (limit + 1) << 6; // Round up to the next block boundary.
444                         minStart = limit; // Ignore further ranges in this block.
445                     }
446                 }
447             }
448 
449             if (limit == 0x10000) {
450                 break;
451             }
452 
453             start = list[listIndex++];
454             if (listIndex < listLength) {
455                 limit = list[listIndex++];
456             } else {
457                 limit = 0x110000;
458             }
459         }
460     }
461 
462 
463     /**
464      * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
465      * points in a certain range.
466      *
467      * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
468      * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
469      *
470      * @param c
471      *            a character in a subrange of MIN_VALUE..MAX_VALUE
472      * @param lo
473      *            The lowest index to be returned.
474      * @param hi
475      *            The highest index to be returned.
476      * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
477      */
findCodePoint(int c, int lo, int hi)478     private int findCodePoint(int c, int lo, int hi) {
479         /* Examples:
480                                            findCodePoint(c)
481            set              list[]         c=0 1 3 4 7 8
482            ===              ==============   ===========
483            []               [110000]         0 0 0 0 0 0
484            [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
485            [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
486            [:Any:]          [0, 110000]      1 1 1 1 1 1
487          */
488 
489         // Return the smallest i such that c < list[i]. Assume
490         // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
491         if (c < list[lo])
492             return lo;
493         // High runner test. c is often after the last range, so an
494         // initial check for this condition pays off.
495         if (lo >= hi || c >= list[hi - 1])
496             return hi;
497         // invariant: c >= list[lo]
498         // invariant: c < list[hi]
499         for (;;) {
500             int i = (lo + hi) >>> 1;
501             if (i == lo) {
502                 break; // Found!
503             } else if (c < list[i]) {
504                 hi = i;
505             } else {
506                 lo = i;
507             }
508         }
509         return hi;
510     }
511 
containsSlow(int c, int lo, int hi)512     private final boolean containsSlow(int c, int lo, int hi) {
513         return (0 != (findCodePoint(c, lo, hi) & 1));
514     }
515 }
516