• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *******************************************************************************
3  * Copyright (C) 2006-2015, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 
8 package com.ibm.icu.charset;
9 
10 import java.io.IOException;
11 import java.nio.ByteBuffer;
12 
13 import com.ibm.icu.impl.ICUBinary;
14 
15 
16 /* Format of cnvalias.icu -----------------------------------------------------
17  *
18  * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
19  * This binary form contains several tables. All indexes are to uint16_t
20  * units, and not to the bytes (uint8_t units). Addressing everything on
21  * 16-bit boundaries allows us to store more information with small index
22  * numbers, which are also 16-bit in size. The majority of the tables (all
23  * except the string table) consist of 16-bit numbers.
24  *
25  * First there is the size of the Table of Contents (TOC). The TOC
26  * entries contain the size of each section. In order to find the offset
27  * of a section you just need to sum up the sizes of the preceding sections.
28  * The TOC length and entries are an array of uint32_t values.
29  * The first section after the TOC starts immediately after the TOC.
30  *
31  * 1) This section contains a list of converters. This list contains indexes
32  * into the string table for the converter name. The index of this list is
33  * also used by other sections, which are mentioned later on.
34  * This list is not sorted.
35  *
36  * 2) This section contains a list of tags. This list contains indexes
37  * into the string table for the tag name. The index of this list is
38  * also used by other sections, which are mentioned later on.
39  * This list is in priority order of standards.
40  *
41  * 3) This section contains a list of sorted unique aliases. This
42  * list contains indexes into the string table for the alias name. The
43  * index of this list is also used by other sections, like the 4th section.
44  * The index for the 3rd and 4th section is used to get the
45  * alias -> converter name mapping. Section 3 and 4 form a two column table.
46  *
47  * 4) This section contains a list of mapped converter names. Consider this
48  * as a table that maps the 3rd section to the 1st section. This list contains
49  * indexes into the 1st section. The index of this list is the same index in
50  * the 3rd section. There is also some extra information in the high bits of
51  * each converter index in this table. Currently it's only used to say that
52  * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
53  * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
54  * the predigested form of the 5th section so that an alias lookup can be fast.
55  *
56  * 5) This section contains a 2D array with indexes to the 6th section. This
57  * section is the full form of all alias mappings. The column index is the
58  * index into the converter list (column header). The row index is the index
59  * into the tag list (row header). This 2D array is the top part of a 3D array. The
60  * third dimension is in the 6th section.
61  *
62  * 6) This is a blob of variable-length arrays. Each array starts with a size,
63  * and is followed by indexes to alias names in the string table. This is
64  * the third dimension to the section 5. No other section should be referencing
65  * this section.
66  *
67  * 7) Reserved at this time (There is no information). This _usually_ has a
68  * size of 0. Future versions may add more information here.
69  *
70  * 8) This is the string table. All strings are indexed on an even address.
71  * There are two reasons for this. First many chip architectures locate strings
72  * faster on even address boundaries. Second, since all indexes are 16-bit
73  * numbers, this string table can be 128KB in size instead of 64KB when we
74  * only have strings starting on an even address.
75  *
76  *
77  * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
78  * has a unique alias among all converters. That same alias can
79  * be mentioned in other standards on different converters,
80  * but only one alias per tag can be unique.
81  *
82  *
83  *              Converter Names (Usually in TR22 form)
84  *           -------------------------------------------.
85  *     T    /                                          /|
86  *     a   /                                          / |
87  *     g  /                                          /  |
88  *     s /                                          /   |
89  *      /                                          /    |
90  *      ------------------------------------------/     |
91  *    A |                                         |     |
92  *    l |                                         |     |
93  *    i |                                         |    /
94  *    a |                                         |   /
95  *    s |                                         |  /
96  *    e |                                         | /
97  *    s |                                         |/
98  *      -------------------------------------------
99  *
100  *
101  *
102  * Here is what it really looks like. It's like swiss cheese.
103  * There are holes. Some converters aren't recognized by
104  * a standard, or they are really old converters that the
105  * standard doesn't recognize anymore.
106  *
107  *              Converter Names (Usually in TR22 form)
108  *           -------------------------------------------.
109  *     T    /##########################################/|
110  *     a   /     #            #                       /#
111  *     g  /  #      ##     ##     ### # ### ### ### #/
112  *     s / #             #####  ####        ##  ## #/#
113  *      / ### # # ##  #  #   #          ### # #   #/##
114  *      ------------------------------------------/# #
115  *    A |### # # ##  #  #   #          ### # #   #|# #
116  *    l |# # #    #     #               ## #     #|# #
117  *    i |# # #    #     #                #       #|#
118  *    a |#                                       #|#
119  *    s |                                        #|#
120  *    e
121  *    s
122  *
123  */
124 
125 final class UConverterAliasDataReader implements ICUBinary.Authenticate {
126 //    private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader");
127 
128    /**
129     * <p>Protected constructor.</p>
130     * @param bytes ICU uprop.dat file buffer
131     * @exception IOException throw if data file fails authentication
132     */
UConverterAliasDataReader(ByteBuffer bytes)133     protected UConverterAliasDataReader(ByteBuffer bytes)
134                                         throws IOException{
135         //if(debug) System.out.println("Bytes in buffer " + bytes.remaining());
136 
137         byteBuffer = bytes;
138         /*unicodeVersion = */ICUBinary.readHeader(byteBuffer, DATA_FORMAT_ID, this);
139 
140         //if(debug) System.out.println("Bytes left in byteBuffer " + byteBuffer.remaining());
141     }
142 
143     // protected methods -------------------------------------------------
144 
readToc(int n)145     protected int[] readToc(int n)throws IOException
146     {
147         //Read the toc
148         return ICUBinary.getInts(byteBuffer, n, 0);
149     }
150 
isDataVersionAcceptable(byte version[])151     public boolean isDataVersionAcceptable(byte version[])
152     {
153         return version.length >= DATA_FORMAT_VERSION.length
154             && version[0] == DATA_FORMAT_VERSION[0]
155             && version[1] == DATA_FORMAT_VERSION[1]
156             && version[2] == DATA_FORMAT_VERSION[2];
157     }
158 
159     /*byte[] getUnicodeVersion(){
160         return ICUBinary.getVersionByteArrayFromCompactInt(unicodeVersion);
161     }*/
162     // private data members -------------------------------------------------
163 
164 
165     /**
166     * ICU data file buffer
167     */
168     private ByteBuffer byteBuffer;
169 
170 //    private int unicodeVersion;
171 
172     /**
173     * File format version that this class understands.
174     * No guarantees are made if a older version is used
175     * see store.c of gennorm for more information and values
176     */
177         // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
178     private static final int DATA_FORMAT_ID = 0x4376416c; // dataFormat="CvAl"
179     private static final byte DATA_FORMAT_VERSION[] = {3, 0, 1};
180 }
181