• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 ******************************************************************************
5 * Copyright (C) 1996-2010, International Business Machines Corporation and   *
6 * others. All Rights Reserved.                                               *
7 ******************************************************************************
8 */
9 
10 package com.ibm.icu.impl;
11 
12 import java.io.DataOutputStream;
13 import java.io.IOException;
14 import java.io.OutputStream;
15 import java.util.Arrays;
16 
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.text.UTF16;
19 
20 /**
21  * Builder class to manipulate and generate a trie.
22  * This is useful for ICU data in primitive types.
23  * Provides a compact way to store information that is indexed by Unicode
24  * values, such as character properties, types, keyboard values, etc. This is
25  * very useful when you have a block of Unicode data that contains significant
26  * values while the rest of the Unicode data is unused in the application or
27  * when you have a lot of redundance, such as where all 21,000 Han ideographs
28  * have the same value.  However, lookup is much faster than a hash table.
29  * A trie of any primitive data type serves two purposes:
30  * <UL type = round>
31  *     <LI>Fast access of the indexed values.
32  *     <LI>Smaller memory footprint.
33  * </UL>
34  * This is a direct port from the ICU4C version
35  * @author             Syn Wee Quek
36  */
37 public class IntTrieBuilder extends TrieBuilder
38 {
39     // public constructor ----------------------------------------------
40 
41     /**
42      * Copy constructor
43      */
IntTrieBuilder(IntTrieBuilder table)44     public IntTrieBuilder(IntTrieBuilder table)
45     {
46         super(table);
47         m_data_ = new int[m_dataCapacity_];
48         System.arraycopy(table.m_data_, 0, m_data_, 0, m_dataLength_);
49         m_initialValue_ = table.m_initialValue_;
50         m_leadUnitValue_ = table.m_leadUnitValue_;
51     }
52 
53     /**
54      * Constructs a build table
55      * @param aliasdata data to be filled into table
56      * @param maxdatalength maximum data length allowed in table
57      * @param initialvalue inital data value
58      * @param latin1linear is latin 1 to be linear
59      */
IntTrieBuilder(int aliasdata[], int maxdatalength, int initialvalue, int leadunitvalue, boolean latin1linear)60     public IntTrieBuilder(int aliasdata[], int maxdatalength,
61                           int initialvalue, int leadunitvalue,
62                           boolean latin1linear)
63     {
64         super();
65         if (maxdatalength < DATA_BLOCK_LENGTH || (latin1linear
66                                                   && maxdatalength < 1024)) {
67             throw new IllegalArgumentException(
68                                                "Argument maxdatalength is too small");
69         }
70 
71         if (aliasdata != null) {
72             m_data_ = aliasdata;
73         }
74         else {
75             m_data_ = new int[maxdatalength];
76         }
77 
78         // preallocate and reset the first data block (block index 0)
79         int j = DATA_BLOCK_LENGTH;
80 
81         if (latin1linear) {
82             // preallocate and reset the first block (number 0) and Latin-1
83             // (U+0000..U+00ff) after that made sure above that
84             // maxDataLength >= 1024
85             // set indexes to point to consecutive data blocks
86             int i = 0;
87             do {
88                 // do this at least for trie->index[0] even if that block is
89                 // only partly used for Latin-1
90                 m_index_[i ++] = j;
91                 j += DATA_BLOCK_LENGTH;
92             } while (i < (256 >> SHIFT_));
93         }
94 
95         m_dataLength_ = j;
96         // reset the initially allocated blocks to the initial value
97         Arrays.fill(m_data_, 0, m_dataLength_, initialvalue);
98         m_initialValue_ = initialvalue;
99         m_leadUnitValue_ = leadunitvalue;
100         m_dataCapacity_ = maxdatalength;
101         m_isLatin1Linear_ = latin1linear;
102         m_isCompacted_ = false;
103     }
104 
105     // public methods -------------------------------------------------------
106 
107     /*public final void print()
108       {
109       int i = 0;
110       int oldvalue = m_index_[i];
111       int count = 0;
112       System.out.println("index length " + m_indexLength_
113       + " --------------------------");
114       while (i < m_indexLength_) {
115       if (m_index_[i] != oldvalue) {
116       System.out.println("index has " + count + " counts of "
117       + Integer.toHexString(oldvalue));
118       count = 0;
119       oldvalue = m_index_[i];
120       }
121       count ++;
122       i ++;
123       }
124       System.out.println("index has " + count + " counts of "
125       + Integer.toHexString(oldvalue));
126       i = 0;
127       oldvalue = m_data_[i];
128       count = 0;
129       System.out.println("data length " + m_dataLength_
130       + " --------------------------");
131       while (i < m_dataLength_) {
132       if (m_data_[i] != oldvalue) {
133       if ((oldvalue & 0xf1000000) == 0xf1000000) {
134       int temp = oldvalue & 0xffffff;
135       temp += 0x320;
136       oldvalue = 0xf1000000 | temp;
137       }
138       if ((oldvalue & 0xf2000000) == 0xf2000000) {
139       int temp = oldvalue & 0xffffff;
140       temp += 0x14a;
141       oldvalue = 0xf2000000 | temp;
142       }
143       System.out.println("data has " + count + " counts of "
144       + Integer.toHexString(oldvalue));
145       count = 0;
146       oldvalue = m_data_[i];
147       }
148       count ++;
149       i ++;
150       }
151       if ((oldvalue & 0xf1000000) == 0xf1000000) {
152       int temp = oldvalue & 0xffffff;
153       temp += 0x320;
154       oldvalue = 0xf1000000 | temp;
155       }
156       if ((oldvalue & 0xf2000000) == 0xf2000000) {
157       int temp = oldvalue & 0xffffff;
158       temp += 0x14a;
159       oldvalue = 0xf2000000 | temp;
160       }
161       System.out.println("data has " + count + " counts of "
162       + Integer.toHexString(oldvalue));
163       }
164     */
165     /**
166      * Gets a 32 bit data from the table data
167      * @param ch codepoint which data is to be retrieved
168      * @return the 32 bit data
169      */
getValue(int ch)170     public int getValue(int ch)
171     {
172         // valid, uncompacted trie and valid c?
173         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
174             return 0;
175         }
176 
177         int block = m_index_[ch >> SHIFT_];
178         return m_data_[Math.abs(block) + (ch & MASK_)];
179     }
180 
181     /**
182      * Get a 32 bit data from the table data
183      * @param ch  code point for which data is to be retrieved.
184      * @param inBlockZero  Output parameter, inBlockZero[0] returns true if the
185      *                      char maps into block zero, otherwise false.
186      * @return the 32 bit data value.
187      */
getValue(int ch, boolean [] inBlockZero)188     public int getValue(int ch, boolean [] inBlockZero)
189     {
190         // valid, uncompacted trie and valid c?
191         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
192             if (inBlockZero != null) {
193                 inBlockZero[0] = true;
194             }
195             return 0;
196         }
197 
198         int block = m_index_[ch >> SHIFT_];
199         if (inBlockZero != null) {
200             inBlockZero[0] = (block == 0);
201         }
202         return m_data_[Math.abs(block) + (ch & MASK_)];
203     }
204 
205 
206     /**
207      * Sets a 32 bit data in the table data
208      * @param ch codepoint which data is to be set
209      * @param value to set
210      * @return true if the set is successful, otherwise
211      *              if the table has been compacted return false
212      */
setValue(int ch, int value)213     public boolean setValue(int ch, int value)
214     {
215         // valid, uncompacted trie and valid c?
216         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
217             return false;
218         }
219 
220         int block = getDataBlock(ch);
221         if (block < 0) {
222             return false;
223         }
224 
225         m_data_[block + (ch & MASK_)] = value;
226         return true;
227     }
228 
229     /**
230      * Serializes the build table with 32 bit data
231      * @param datamanipulate builder raw fold method implementation
232      * @param triedatamanipulate result trie fold method
233      * @return a new trie
234      */
serialize(TrieBuilder.DataManipulate datamanipulate, Trie.DataManipulate triedatamanipulate)235     public IntTrie serialize(TrieBuilder.DataManipulate datamanipulate,
236                              Trie.DataManipulate triedatamanipulate)
237     {
238         if (datamanipulate == null) {
239             throw new IllegalArgumentException("Parameters can not be null");
240         }
241         // fold and compact if necessary, also checks that indexLength is
242         // within limits
243         if (!m_isCompacted_) {
244             // compact once without overlap to improve folding
245             compact(false);
246             // fold the supplementary part of the index array
247             fold(datamanipulate);
248             // compact again with overlap for minimum data array length
249             compact(true);
250             m_isCompacted_ = true;
251         }
252         // is dataLength within limits?
253         if (m_dataLength_ >= MAX_DATA_LENGTH_) {
254             throw new ArrayIndexOutOfBoundsException("Data length too small");
255         }
256 
257         char index[] = new char[m_indexLength_];
258         int data[] = new int[m_dataLength_];
259         // write the index (stage 1) array and the 32-bit data (stage 2) array
260         // write 16-bit index values shifted right by INDEX_SHIFT_
261         for (int i = 0; i < m_indexLength_; i ++) {
262             index[i] = (char)(m_index_[i] >>> INDEX_SHIFT_);
263         }
264         // write 32-bit data values
265         System.arraycopy(m_data_, 0, data, 0, m_dataLength_);
266 
267         int options = SHIFT_ | (INDEX_SHIFT_ << OPTIONS_INDEX_SHIFT_);
268         options |= OPTIONS_DATA_IS_32_BIT_;
269         if (m_isLatin1Linear_) {
270             options |= OPTIONS_LATIN1_IS_LINEAR_;
271         }
272         return new IntTrie(index, data, m_initialValue_, options,
273                            triedatamanipulate);
274     }
275 
276 
277     /**
278      * Serializes the build table to an output stream.
279      *
280      * Compacts the build-time trie after all values are set, and then
281      * writes the serialized form onto an output stream.
282      *
283      * After this, this build-time Trie can only be serialized again and/or closed;
284      * no further values can be added.
285      *
286      * This function is the rough equivalent of utrie_seriaize() in ICU4C.
287      *
288      * @param os the output stream to which the seriaized trie will be written.
289      *         If nul, the function still returns the size of the serialized Trie.
290      * @param reduceTo16Bits If true, reduce the data size to 16 bits.  The resulting
291      *         serialized form can then be used to create a CharTrie.
292      * @param datamanipulate builder raw fold method implementation
293      * @return the number of bytes written to the output stream.
294      */
serialize(OutputStream os, boolean reduceTo16Bits, TrieBuilder.DataManipulate datamanipulate)295      public int serialize(OutputStream os, boolean reduceTo16Bits,
296             TrieBuilder.DataManipulate datamanipulate)  throws IOException {
297          if (datamanipulate == null) {
298              throw new IllegalArgumentException("Parameters can not be null");
299          }
300 
301          // fold and compact if necessary, also checks that indexLength is
302          // within limits
303          if (!m_isCompacted_) {
304              // compact once without overlap to improve folding
305              compact(false);
306              // fold the supplementary part of the index array
307              fold(datamanipulate);
308              // compact again with overlap for minimum data array length
309              compact(true);
310              m_isCompacted_ = true;
311          }
312 
313          // is dataLength within limits?
314          int length;
315          if (reduceTo16Bits) {
316              length = m_dataLength_ + m_indexLength_;
317          } else {
318              length = m_dataLength_;
319          }
320          if (length >= MAX_DATA_LENGTH_) {
321              throw new ArrayIndexOutOfBoundsException("Data length too small");
322          }
323 
324          //  struct UTrieHeader {
325          //      int32_t   signature;
326          //      int32_t   options  (a bit field)
327          //      int32_t   indexLength
328          //      int32_t   dataLength
329          length = Trie.HEADER_LENGTH_ + 2*m_indexLength_;
330          if(reduceTo16Bits) {
331              length+=2*m_dataLength_;
332          } else {
333              length+=4*m_dataLength_;
334          }
335 
336          if (os == null) {
337              // No output stream.  Just return the length of the serialized Trie, in bytes.
338              return length;
339          }
340 
341          DataOutputStream dos = new DataOutputStream(os);
342          dos.writeInt(Trie.HEADER_SIGNATURE_);
343 
344          int options = Trie.INDEX_STAGE_1_SHIFT_ | (Trie.INDEX_STAGE_2_SHIFT_<<Trie.HEADER_OPTIONS_INDEX_SHIFT_);
345          if(!reduceTo16Bits) {
346              options |= Trie.HEADER_OPTIONS_DATA_IS_32_BIT_;
347          }
348          if(m_isLatin1Linear_) {
349              options |= Trie.HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_;
350          }
351          dos.writeInt(options);
352 
353          dos.writeInt(m_indexLength_);
354          dos.writeInt(m_dataLength_);
355 
356          /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
357          if(reduceTo16Bits) {
358              /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
359              for (int i=0; i<m_indexLength_; i++) {
360                  int v = (m_index_[i] + m_indexLength_) >>> Trie.INDEX_STAGE_2_SHIFT_;
361                  dos.writeChar(v);
362              }
363 
364              /* write 16-bit data values */
365              for(int i=0; i<m_dataLength_; i++) {
366                  int v = m_data_[i] & 0x0000ffff;
367                  dos.writeChar(v);
368              }
369          } else {
370              /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
371              for (int i=0; i<m_indexLength_; i++) {
372                  int v = (m_index_[i]) >>> Trie.INDEX_STAGE_2_SHIFT_;
373                  dos.writeChar(v);
374              }
375 
376              /* write 32-bit data values */
377              for(int i=0; i<m_dataLength_; i++) {
378                  dos.writeInt(m_data_[i]);
379              }
380         }
381 
382          return length;
383 
384     }
385 
386 
387     /**
388      * Set a value in a range of code points [start..limit].
389      * All code points c with start &lt;= c &lt; limit will get the value if
390      * overwrite is true or if the old value is 0.
391      * @param start the first code point to get the value
392      * @param limit one past the last code point to get the value
393      * @param value the value
394      * @param overwrite flag for whether old non-initial values are to be
395      *        overwritten
396      * @return false if a failure occurred (illegal argument or data array
397      *               overrun)
398      */
setRange(int start, int limit, int value, boolean overwrite)399     public boolean setRange(int start, int limit, int value,
400                             boolean overwrite)
401     {
402         // repeat value in [start..limit[
403         // mark index values for repeat-data blocks by setting bit 31 of the
404         // index values fill around existing values if any, if(overwrite)
405 
406         // valid, uncompacted trie and valid indexes?
407         if (m_isCompacted_ || start < UCharacter.MIN_VALUE
408             || start > UCharacter.MAX_VALUE || limit < UCharacter.MIN_VALUE
409             || limit > (UCharacter.MAX_VALUE + 1) || start > limit) {
410             return false;
411         }
412 
413         if (start == limit) {
414             return true; // nothing to do
415         }
416 
417         if ((start & MASK_) != 0) {
418             // set partial block at [start..following block boundary[
419             int block = getDataBlock(start);
420             if (block < 0) {
421                 return false;
422             }
423 
424             int nextStart = (start + DATA_BLOCK_LENGTH) & ~MASK_;
425             if (nextStart <= limit) {
426                 fillBlock(block, start & MASK_, DATA_BLOCK_LENGTH,
427                           value, overwrite);
428                 start = nextStart;
429             }
430             else {
431                 fillBlock(block, start & MASK_, limit & MASK_,
432                           value, overwrite);
433                 return true;
434             }
435         }
436 
437         // number of positions in the last, partial block
438         int rest = limit & MASK_;
439 
440         // round down limit to a block boundary
441         limit &= ~MASK_;
442 
443         // iterate over all-value blocks
444         int repeatBlock = 0;
445         if (value == m_initialValue_) {
446             // repeatBlock = 0; assigned above
447         }
448         else {
449             repeatBlock = -1;
450         }
451         while (start < limit) {
452             // get index value
453             int block = m_index_[start >> SHIFT_];
454             if (block > 0) {
455                 // already allocated, fill in value
456                 fillBlock(block, 0, DATA_BLOCK_LENGTH, value, overwrite);
457             }
458             else if (m_data_[-block] != value && (block == 0 || overwrite)) {
459                 // set the repeatBlock instead of the current block 0 or range
460                 // block
461                 if (repeatBlock >= 0) {
462                     m_index_[start >> SHIFT_] = -repeatBlock;
463                 }
464                 else {
465                     // create and set and fill the repeatBlock
466                     repeatBlock = getDataBlock(start);
467                     if (repeatBlock < 0) {
468                         return false;
469                     }
470 
471                     // set the negative block number to indicate that it is a
472                     // repeat block
473                     m_index_[start >> SHIFT_] = -repeatBlock;
474                     fillBlock(repeatBlock, 0, DATA_BLOCK_LENGTH, value, true);
475                 }
476             }
477 
478             start += DATA_BLOCK_LENGTH;
479         }
480 
481         if (rest > 0) {
482             // set partial block at [last block boundary..limit[
483             int block = getDataBlock(start);
484             if (block < 0) {
485                 return false;
486             }
487             fillBlock(block, 0, rest, value, overwrite);
488         }
489 
490         return true;
491     }
492 
493     // protected data member ------------------------------------------------
494 
495     protected int m_data_[];
496     protected int m_initialValue_;
497 
498     //  private data member ------------------------------------------------
499 
500     private int m_leadUnitValue_;
501 
502     // private methods ------------------------------------------------------
503 
allocDataBlock()504     private int allocDataBlock()
505     {
506         int newBlock = m_dataLength_;
507         int newTop = newBlock + DATA_BLOCK_LENGTH;
508         if (newTop > m_dataCapacity_) {
509             // out of memory in the data array
510             return -1;
511         }
512         m_dataLength_ = newTop;
513         return newBlock;
514     }
515 
516     /**
517      * No error checking for illegal arguments.
518      * @param ch codepoint to look for
519      * @return -1 if no new data block available (out of memory in data array)
520      */
getDataBlock(int ch)521     private int getDataBlock(int ch)
522     {
523         ch >>= SHIFT_;
524         int indexValue = m_index_[ch];
525         if (indexValue > 0) {
526             return indexValue;
527         }
528 
529         // allocate a new data block
530         int newBlock = allocDataBlock();
531         if (newBlock < 0) {
532             // out of memory in the data array
533             return -1;
534         }
535         m_index_[ch] = newBlock;
536 
537         // copy-on-write for a block from a setRange()
538         System.arraycopy(m_data_, Math.abs(indexValue), m_data_, newBlock,
539                          DATA_BLOCK_LENGTH << 2);
540         return newBlock;
541     }
542 
543     /**
544      * Compact a folded build-time trie.
545      * The compaction
546      * - removes blocks that are identical with earlier ones
547      * - overlaps adjacent blocks as much as possible (if overlap == true)
548      * - moves blocks in steps of the data granularity
549      * - moves and overlaps blocks that overlap with multiple values in the overlap region
550      *
551      * It does not
552      * - try to move and overlap blocks that are not already adjacent
553      * @param overlap flag
554      */
compact(boolean overlap)555     private void compact(boolean overlap)
556     {
557         if (m_isCompacted_) {
558             return; // nothing left to do
559         }
560 
561         // compaction
562         // initialize the index map with "block is used/unused" flags
563         findUnusedBlocks();
564 
565         // if Latin-1 is preallocated and linear, then do not compact Latin-1
566         // data
567         int overlapStart = DATA_BLOCK_LENGTH;
568         if (m_isLatin1Linear_ && SHIFT_ <= 8) {
569             overlapStart += 256;
570         }
571 
572         int newStart = DATA_BLOCK_LENGTH;
573         int i;
574         for (int start = newStart; start < m_dataLength_;) {
575             // start: index of first entry of current block
576             // newStart: index where the current block is to be moved
577             //           (right after current end of already-compacted data)
578             // skip blocks that are not used
579             if (m_map_[start >>> SHIFT_] < 0) {
580                 // advance start to the next block
581                 start += DATA_BLOCK_LENGTH;
582                 // leave newStart with the previous block!
583                 continue;
584             }
585             // search for an identical block
586             if (start >= overlapStart) {
587                 i = findSameDataBlock(m_data_, newStart, start,
588                                           overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH);
589                 if (i >= 0) {
590                     // found an identical block, set the other block's index
591                     // value for the current block
592                     m_map_[start >>> SHIFT_] = i;
593                     // advance start to the next block
594                     start += DATA_BLOCK_LENGTH;
595                     // leave newStart with the previous block!
596                     continue;
597                 }
598             }
599             // see if the beginning of this block can be overlapped with the
600             // end of the previous block
601             if(overlap && start>=overlapStart) {
602                 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
603                 for(i=DATA_BLOCK_LENGTH-DATA_GRANULARITY_;
604                     i>0 && !equal_int(m_data_, newStart-i, start, i);
605                     i-=DATA_GRANULARITY_) {}
606             } else {
607                 i=0;
608             }
609             if (i > 0) {
610                 // some overlap
611                 m_map_[start >>> SHIFT_] = newStart - i;
612                 // move the non-overlapping indexes to their new positions
613                 start += i;
614                 for (i = DATA_BLOCK_LENGTH - i; i > 0; -- i) {
615                     m_data_[newStart ++] = m_data_[start ++];
616                 }
617             }
618             else if (newStart < start) {
619                 // no overlap, just move the indexes to their new positions
620                 m_map_[start >>> SHIFT_] = newStart;
621                 for (i = DATA_BLOCK_LENGTH; i > 0; -- i) {
622                     m_data_[newStart ++] = m_data_[start ++];
623                 }
624             }
625             else { // no overlap && newStart==start
626                 m_map_[start >>> SHIFT_] = start;
627                 newStart += DATA_BLOCK_LENGTH;
628                 start = newStart;
629             }
630         }
631         // now adjust the index (stage 1) table
632         for (i = 0; i < m_indexLength_; ++ i) {
633             m_index_[i] = m_map_[Math.abs(m_index_[i]) >>> SHIFT_];
634         }
635         m_dataLength_ = newStart;
636     }
637 
638     /**
639      * Find the same data block
640      * @param data array
641      * @param dataLength
642      * @param otherBlock
643      * @param step
644      */
findSameDataBlock(int data[], int dataLength, int otherBlock, int step)645     private static final int findSameDataBlock(int data[], int dataLength,
646                                                int otherBlock, int step)
647     {
648         // ensure that we do not even partially get past dataLength
649         dataLength -= DATA_BLOCK_LENGTH;
650 
651         for (int block = 0; block <= dataLength; block += step) {
652             if(equal_int(data, block, otherBlock, DATA_BLOCK_LENGTH)) {
653                 return block;
654             }
655         }
656         return -1;
657     }
658 
659     /**
660      * Fold the normalization data for supplementary code points into
661      * a compact area on top of the BMP-part of the trie index,
662      * with the lead surrogates indexing this compact area.
663      *
664      * Duplicate the index values for lead surrogates:
665      * From inside the BMP area, where some may be overridden with folded values,
666      * to just after the BMP area, where they can be retrieved for
667      * code point lookups.
668      * @param manipulate fold implementation
669      */
fold(DataManipulate manipulate)670     private final void fold(DataManipulate manipulate)
671     {
672         int leadIndexes[] = new int[SURROGATE_BLOCK_COUNT_];
673         int index[] = m_index_;
674         // copy the lead surrogate indexes into a temporary array
675         System.arraycopy(index, 0xd800 >> SHIFT_, leadIndexes, 0,
676                          SURROGATE_BLOCK_COUNT_);
677 
678         // set all values for lead surrogate code *units* to leadUnitValue
679         // so that by default runtime lookups will find no data for associated
680         // supplementary code points, unless there is data for such code points
681         // which will result in a non-zero folding value below that is set for
682         // the respective lead units
683         // the above saved the indexes for surrogate code *points*
684         // fill the indexes with simplified code from utrie_setRange32()
685         int block = 0;
686         if (m_leadUnitValue_ == m_initialValue_) {
687             // leadUnitValue == initialValue, use all-initial-value block
688             // block = 0; if block here left empty
689         }
690         else {
691             // create and fill the repeatBlock
692             block = allocDataBlock();
693             if (block < 0) {
694                 // data table overflow
695                 throw new IllegalStateException("Internal error: Out of memory space");
696             }
697             fillBlock(block, 0, DATA_BLOCK_LENGTH, m_leadUnitValue_, true);
698             // negative block number to indicate that it is a repeat block
699             block = -block;
700         }
701         for (int c = (0xd800 >> SHIFT_); c < (0xdc00 >> SHIFT_); ++ c) {
702             m_index_[c] = block;
703         }
704 
705         // Fold significant index values into the area just after the BMP
706         // indexes.
707         // In case the first lead surrogate has significant data,
708         // its index block must be used first (in which case the folding is a
709         // no-op).
710         // Later all folded index blocks are moved up one to insert the copied
711         // lead surrogate indexes.
712         int indexLength = BMP_INDEX_LENGTH_;
713         // search for any index (stage 1) entries for supplementary code points
714         for (int c = 0x10000; c < 0x110000;) {
715             if (index[c >> SHIFT_] != 0) {
716                 // there is data, treat the full block for a lead surrogate
717                 c &= ~0x3ff;
718                 // is there an identical index block?
719                 block = findSameIndexBlock(index, indexLength, c >> SHIFT_);
720 
721                 // get a folded value for [c..c+0x400[ and,
722                 // if different from the value for the lead surrogate code
723                 // point, set it for the lead surrogate code unit
724 
725                 int value = manipulate.getFoldedValue(c,
726                                                       block + SURROGATE_BLOCK_COUNT_);
727                 if (value != getValue(UTF16.getLeadSurrogate(c))) {
728                     if (!setValue(UTF16.getLeadSurrogate(c), value)) {
729                         // data table overflow
730                         throw new ArrayIndexOutOfBoundsException(
731                                                                  "Data table overflow");
732                     }
733                     // if we did not find an identical index block...
734                     if (block == indexLength) {
735                         // move the actual index (stage 1) entries from the
736                         // supplementary position to the new one
737                         System.arraycopy(index, c >> SHIFT_, index, indexLength,
738                                          SURROGATE_BLOCK_COUNT_);
739                         indexLength += SURROGATE_BLOCK_COUNT_;
740                     }
741                 }
742                 c += 0x400;
743             }
744             else {
745                 c += DATA_BLOCK_LENGTH;
746             }
747         }
748 
749         // index array overflow?
750         // This is to guarantee that a folding offset is of the form
751         // UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
752         // If the index is too large, then n>=1024 and more than 10 bits are
753         // necessary.
754         // In fact, it can only ever become n==1024 with completely unfoldable
755         // data and the additional block of duplicated values for lead
756         // surrogates.
757         if (indexLength >= MAX_INDEX_LENGTH_) {
758             throw new ArrayIndexOutOfBoundsException("Index table overflow");
759         }
760         // make space for the lead surrogate index block and insert it between
761         // the BMP indexes and the folded ones
762         System.arraycopy(index, BMP_INDEX_LENGTH_, index,
763                          BMP_INDEX_LENGTH_ + SURROGATE_BLOCK_COUNT_,
764                          indexLength - BMP_INDEX_LENGTH_);
765         System.arraycopy(leadIndexes, 0, index, BMP_INDEX_LENGTH_,
766                          SURROGATE_BLOCK_COUNT_);
767         indexLength += SURROGATE_BLOCK_COUNT_;
768         m_indexLength_ = indexLength;
769     }
770 
771     /**
772      * @internal
773      */
fillBlock(int block, int start, int limit, int value, boolean overwrite)774     private void fillBlock(int block, int start, int limit, int value,
775                            boolean overwrite)
776     {
777         limit += block;
778         block += start;
779         if (overwrite) {
780             while (block < limit) {
781                 m_data_[block ++] = value;
782             }
783         }
784         else {
785             while (block < limit) {
786                 if (m_data_[block] == m_initialValue_) {
787                     m_data_[block] = value;
788                 }
789                 ++ block;
790             }
791         }
792     }
793 }
794 
795