• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 2006-2015, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.charset;
11 
12 import java.io.IOException;
13 import java.nio.ByteBuffer;
14 
15 import com.ibm.icu.impl.ICUBinary;
16 
17 
18 /* Format of cnvalias.icu -----------------------------------------------------
19  *
20  * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
21  * This binary form contains several tables. All indexes are to uint16_t
22  * units, and not to the bytes (uint8_t units). Addressing everything on
23  * 16-bit boundaries allows us to store more information with small index
24  * numbers, which are also 16-bit in size. The majority of the table (except
25  * the string table) are 16-bit numbers.
26  *
27  * First there is the size of the Table of Contents (TOC). The TOC
28  * entries contain the size of each section. In order to find the offset
29  * you just need to sum up the previous offsets.
30  * The TOC length and entries are an array of uint32_t values.
31  * The first section after the TOC starts immediately after the TOC.
32  *
33  * 1) This section contains a list of converters. This list contains indexes
34  * into the string table for the converter name. The index of this list is
35  * also used by other sections, which are mentioned later on.
36  * This list is not sorted.
37  *
38  * 2) This section contains a list of tags. This list contains indexes
39  * into the string table for the tag name. The index of this list is
40  * also used by other sections, which are mentioned later on.
41  * This list is in priority order of standards.
42  *
43  * 3) This section contains a list of sorted unique aliases. This
44  * list contains indexes into the string table for the alias name. The
45  * index of this list is also used by other sections, like the 4th section.
46  * The index for the 3rd and 4th section is used to get the
47  * alias -> converter name mapping. Section 3 and 4 form a two column table.
48  *
49  * 4) This section contains a list of mapped converter names. Consider this
50  * as a table that maps the 3rd section to the 1st section. This list contains
51  * indexes into the 1st section. The index of this list is the same index in
52  * the 3rd section. There is also some extra information in the high bits of
53  * each converter index in this table. Currently it's only used to say that
54  * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
55  * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
56  * the predigested form of the 5th section so that an alias lookup can be fast.
57  *
58  * 5) This section contains a 2D array with indexes to the 6th section. This
59  * section is the full form of all alias mappings. The column index is the
60  * index into the converter list (column header). The row index is the index
61  * to tag list (row header). This 2D array is the top part of a 3D array. The
62  * third dimension is in the 6th section.
63  *
64  * 6) This is a blob of variable-length arrays. Each array starts with a size,
65  * and is followed by indexes to alias names in the string table. This is
66  * the third dimension to the section 5. No other section should be referencing
67  * this section.
68  *
69  * 7) Reserved at this time (There is no information). This _usually_ has a
70  * size of 0. Future versions may add more information here.
71  *
72  * 8) This is the string table. All strings are indexed on an even address.
73  * There are two reasons for this. First many chip architectures locate strings
74  * faster on even address boundaries. Second, since all indexes are 16-bit
75  * numbers, this string table can be 128KB in size instead of 64KB when we
76  * only have strings starting on an even address.
77  *
78  *
79  * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
80  * has a unique alias among all converters. That same alias can
81  * be mentioned in other standards on different converters,
82  * but only one alias per tag can be unique.
83  *
84  *
85  *              Converter Names (Usually in TR22 form)
86  *           -------------------------------------------.
87  *     T    /                                          /|
88  *     a   /                                          / |
89  *     g  /                                          /  |
90  *     s /                                          /   |
91  *      /                                          /    |
92  *      ------------------------------------------/     |
93  *    A |                                         |     |
94  *    l |                                         |     |
95  *    i |                                         |    /
96  *    a |                                         |   /
97  *    s |                                         |  /
98  *    e |                                         | /
99  *    s |                                         |/
100  *      -------------------------------------------
101  *
102  *
103  *
104  * Here is what it really looks like. It's like swiss cheese.
105  * There are holes. Some converters aren't recognized by
106  * a standard, or they are really old converters that the
107  * standard doesn't recognize anymore.
108  *
109  *              Converter Names (Usually in TR22 form)
110  *           -------------------------------------------.
111  *     T    /##########################################/|
112  *     a   /     #            #                       /#
113  *     g  /  #      ##     ##     ### # ### ### ### #/
114  *     s / #             #####  ####        ##  ## #/#
115  *      / ### # # ##  #  #   #          ### # #   #/##
116  *      ------------------------------------------/# #
117  *    A |### # # ##  #  #   #          ### # #   #|# #
118  *    l |# # #    #     #               ## #     #|# #
119  *    i |# # #    #     #                #       #|#
120  *    a |#                                       #|#
121  *    s |                                        #|#
122  *    e
123  *    s
124  *
125  */
126 
127 final class UConverterAliasDataReader implements ICUBinary.Authenticate {
128 //    private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader");
129 
130    /**
131     * <p>Protected constructor.</p>
132     * @param bytes ICU uprop.dat file buffer
133     * @exception IOException throw if data file fails authentication
134     */
UConverterAliasDataReader(ByteBuffer bytes)135     protected UConverterAliasDataReader(ByteBuffer bytes)
136                                         throws IOException{
137         //if(debug) System.out.println("Bytes in buffer " + bytes.remaining());
138 
139         byteBuffer = bytes;
140         /*unicodeVersion = */ICUBinary.readHeader(byteBuffer, DATA_FORMAT_ID, this);
141 
142         //if(debug) System.out.println("Bytes left in byteBuffer " + byteBuffer.remaining());
143     }
144 
145     // protected methods -------------------------------------------------
146 
readToc(int n)147     protected int[] readToc(int n)throws IOException
148     {
149         //Read the toc
150         return ICUBinary.getInts(byteBuffer, n, 0);
151     }
152 
153     @Override
isDataVersionAcceptable(byte version[])154     public boolean isDataVersionAcceptable(byte version[])
155     {
156         return version.length >= DATA_FORMAT_VERSION.length
157             && version[0] == DATA_FORMAT_VERSION[0]
158             && version[1] == DATA_FORMAT_VERSION[1]
159             && version[2] == DATA_FORMAT_VERSION[2];
160     }
161 
162     /*byte[] getUnicodeVersion(){
163         return ICUBinary.getVersionByteArrayFromCompactInt(unicodeVersion);
164     }*/
165     // private data members -------------------------------------------------
166 
167 
168     /**
169     * ICU data file buffer
170     */
171     private ByteBuffer byteBuffer;
172 
173 //    private int unicodeVersion;
174 
175     /**
176     * File format version that this class understands.
177     * No guarantees are made if a older version is used
178     * see store.c of gennorm for more information and values
179     */
180         // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
181     private static final int DATA_FORMAT_ID = 0x4376416c; // dataFormat="CvAl"
182     private static final byte DATA_FORMAT_VERSION[] = {3, 0, 1};
183 }
184