/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * $Id: EncodingInfo.java 468654 2006-10-28 07:09:23Z minchau $ */ package org.apache.xml.serializer; /** * Holds information about a given encoding, which is the Java name for the * encoding, the equivalent ISO name. *
* An object of this type has two useful methods *
* isInEncoding(char ch); ** which can be called if the character is not the high one in * a surrogate pair and: *
* isInEncoding(char high, char low); ** which can be called if the two characters from a high/low surrogate pair. *
* An EncodingInfo object is a node in a binary search tree. Such a node
* will answer if a character is in the encoding, and do so for a given
* range of unicode values (m_first
to
* m_last
). It will handle a certain range of values
* explicitly (m_explFirst
to m_explLast
).
* If the unicode point is before that explicit range, that is it
* is in the range m_first <= value < m_explFirst
, then it will delegate to another EncodingInfo object for The root
* of such a tree, m_before. Likewise for values in the range
* m_explLast < value <= m_last
, but delgating to m_after
*
* Actually figuring out if a code point is in the encoding is expensive. So the * purpose of this tree is to cache such determinations, and not to build the * entire tree of information at the start, but only build up as much of the * tree as is used during the transformation. *
* This Class is not a public API, and should only be used internally within * the serializer. *
* This class is not a public API. * @xsl.usage internal */ public final class EncodingInfo extends Object { /** * Not all characters in an encoding are in on contiguous group, * however there is a lowest contiguous group starting at '\u0001' * and working up to m_highCharInContiguousGroup. *
* This is the char for which chars at or below this value are * definately in the encoding, although for chars * above this point they might be in the encoding. * This exists for performance, especially for ASCII characters * because for ASCII all chars in the range '\u0001' to '\u007F' * are in the encoding. * */ private final char m_highCharInContiguousGroup; /** * The ISO encoding name. */ final String name; /** * The name used by the Java convertor. */ final String javaName; /** * A helper object that we can ask if a * single char, or a surrogate UTF-16 pair * of chars that form a single character, * is in this encoding. */ private InEncoding m_encoding; /** * This is not a public API. It returns true if the * char in question is in the encoding. * @param ch the char in question. *
* This method is not a public API. * @xsl.usage internal */ public boolean isInEncoding(char ch) { if (m_encoding == null) { m_encoding = new EncodingImpl(); // One could put alternate logic in here to // instantiate another object that implements the // InEncoding interface. For example if the JRE is 1.4 or up // we could have an object that uses JRE 1.4 methods } return m_encoding.isInEncoding(ch); } /** * This is not a public API. It returns true if the * character formed by the high/low pair is in the encoding. * @param high a char that the a high char of a high/low surrogate pair. * @param low a char that is the low char of a high/low surrogate pair. *
* This method is not a public API. * @xsl.usage internal */ public boolean isInEncoding(char high, char low) { if (m_encoding == null) { m_encoding = new EncodingImpl(); // One could put alternate logic in here to // instantiate another object that implements the // InEncoding interface. For example if the JRE is 1.4 or up // we could have an object that uses JRE 1.4 methods } return m_encoding.isInEncoding(high, low); } /** * Create an EncodingInfo object based on the ISO name and Java name. * If both parameters are null any character will be considered to * be in the encoding. This is useful for when the serializer is in * temporary output state, and has no assciated encoding. * * @param name reference to the ISO name. * @param javaName reference to the Java encoding name. * @param highChar The char for which characters at or below this value are * definately in the * encoding, although for characters above this point they might be in the encoding. */ public EncodingInfo(String name, String javaName, char highChar) { this.name = name; this.javaName = javaName; this.m_highCharInContiguousGroup = highChar; } /** * A simple interface to isolate the implementation. * We could also use some new JRE 1.4 methods in another implementation * provided we use reflection with them. *
* This interface is not a public API, * and should only be used internally within the serializer. * @xsl.usage internal */ private interface InEncoding { /** * Returns true if the char is in the encoding */ public boolean isInEncoding(char ch); /** * Returns true if the high/low surrogate pair forms * a character that is in the encoding. */ public boolean isInEncoding(char high, char low); } /** * This class implements the */ private class EncodingImpl implements InEncoding { public boolean isInEncoding(char ch1) { final boolean ret; int codePoint = Encodings.toCodePoint(ch1); if (codePoint < m_explFirst) { // The unicode value is before the range // that we explictly manage, so we delegate the answer. // If we don't have an m_before object to delegate to, make one. if (m_before == null) m_before = new EncodingImpl( m_encoding, m_first, m_explFirst - 1, codePoint); ret = m_before.isInEncoding(ch1); } else if (m_explLast < codePoint) { // The unicode value is after the range // that we explictly manage, so we delegate the answer. // If we don't have an m_after object to delegate to, make one. if (m_after == null) m_after = new EncodingImpl( m_encoding, m_explLast + 1, m_last, codePoint); ret = m_after.isInEncoding(ch1); } else { // The unicode value is in the range we explitly handle final int idx = codePoint - m_explFirst; // If we already know the answer, just return it. if (m_alreadyKnown[idx]) ret = m_isInEncoding[idx]; else { // We don't know the answer, so find out, // which may be expensive, then cache the answer ret = inEncoding(ch1, m_encoding); m_alreadyKnown[idx] = true; m_isInEncoding[idx] = ret; } } return ret; } public boolean isInEncoding(char high, char low) { final boolean ret; int codePoint = Encodings.toCodePoint(high,low); if (codePoint < m_explFirst) { // The unicode value is before the range // that we explictly manage, so we delegate the answer. // If we don't have an m_before object to delegate to, make one. if (m_before == null) m_before = new EncodingImpl( m_encoding, m_first, m_explFirst - 1, codePoint); ret = m_before.isInEncoding(high,low); } else if (m_explLast < codePoint) { // The unicode value is after the range // that we explictly manage, so we delegate the answer. // If we don't have an m_after object to delegate to, make one. if (m_after == null) m_after = new EncodingImpl( m_encoding, m_explLast + 1, m_last, codePoint); ret = m_after.isInEncoding(high,low); } else { // The unicode value is in the range we explitly handle final int idx = codePoint - m_explFirst; // If we already know the answer, just return it. if (m_alreadyKnown[idx]) ret = m_isInEncoding[idx]; else { // We don't know the answer, so find out, // which may be expensive, then cache the answer ret = inEncoding(high, low, m_encoding); m_alreadyKnown[idx] = true; m_isInEncoding[idx] = ret; } } return ret; } /** * The encoding. */ final private String m_encoding; /** * m_first through m_last is the range of unicode * values that this object will return an answer on. * It may delegate to a similar object with a different * range */ final private int m_first; /** * m_explFirst through m_explLast is the range of unicode * value that this object handles explicitly and does not * delegate to a similar object. */ final private int m_explFirst; final private int m_explLast; final private int m_last; /** * The object, of the same type as this one, * that handles unicode values in a range before * the range explictly handled by this object, and * to which this object may delegate. */ private InEncoding m_before; /** * The object, of the same type as this one, * that handles unicode values in a range after * the range explictly handled by this object, and * to which this object may delegate. */ private InEncoding m_after; /** * The number of unicode values explicitly handled * by a single EncodingInfo object. This value is * tuneable, but is set to 128 because that covers the * entire low range of ASCII type chars within a single * object. */ private static final int RANGE = 128; /** * A flag to record if we already know the answer * for the given unicode value. */ final private boolean m_alreadyKnown[] = new boolean[RANGE]; /** * A table holding the answer on whether the given unicode * value is in the encoding. */ final private boolean m_isInEncoding[] = new boolean[RANGE]; private EncodingImpl() { // This object will answer whether any unicode value // is in the encoding, it handles values 0 through Integer.MAX_VALUE this(javaName, 0, Integer.MAX_VALUE, (char) 0); } private EncodingImpl(String encoding, int first, int last, int codePoint) { // Set the range of unicode values that this object manages // either explicitly or implicitly. m_first = first; m_last = last; // Set the range of unicode values that this object // explicitly manages m_explFirst = codePoint; m_explLast = codePoint + (RANGE-1); m_encoding = encoding; if (javaName != null) { // Some optimization. if (0 <= m_explFirst && m_explFirst <= 127) { // This particular EncodingImpl explicitly handles // characters in the low range. if ("UTF8".equals(javaName) || "UTF-16".equals(javaName) || "ASCII".equals(javaName) || "US-ASCII".equals(javaName) || "Unicode".equals(javaName) || "UNICODE".equals(javaName) || javaName.startsWith("ISO8859")) { // Not only does this EncodingImpl object explicitly // handle chracters in the low range, it is // also one that we know something about, without // needing to call inEncoding(char ch, String encoding) // for this low range // // By initializing the table ahead of time // for these low values, we prevent the expensive // inEncoding(char ch, String encoding) // from being called, at least for these common // encodings. for (int unicode = 1; unicode < 127; unicode++) { final int idx = unicode - m_explFirst; if (0 <= idx && idx < RANGE) { m_alreadyKnown[idx] = true; m_isInEncoding[idx] = true; } } } } /* A little bit more than optimization. * * We will say that any character is in the encoding if * we don't have an encoding. * This is meaningful when the serializer is being used * in temporary output state, where we are not writing to * the final output tree. It is when writing to the * final output tree that we need to worry about the output * encoding */ if (javaName == null) { for (int idx = 0; idx < m_alreadyKnown.length; idx++) { m_alreadyKnown[idx] = true; m_isInEncoding[idx] = true; } } } } } /** * This is heart of the code that determines if a given character * is in the given encoding. This method is probably expensive, * and the answer should be cached. *
* This method is not a public API, * and should only be used internally within the serializer. * @param ch the char in question, that is not a high char of * a high/low surrogate pair. * @param encoding the Java name of the enocding. * * @xsl.usage internal * */ private static boolean inEncoding(char ch, String encoding) { boolean isInEncoding; try { char cArray[] = new char[1]; cArray[0] = ch; // Construct a String from the char String s = new String(cArray); // Encode the String into a sequence of bytes // using the given, named charset. byte[] bArray = s.getBytes(encoding); isInEncoding = inEncoding(ch, bArray); } catch (Exception e) { isInEncoding = false; // If for some reason the encoding is null, e.g. // for a temporary result tree, we should just // say that every character is in the encoding. if (encoding == null) isInEncoding = true; } return isInEncoding; } /** * This is heart of the code that determines if a given high/low * surrogate pair forms a character that is in the given encoding. * This method is probably expensive, and the answer should be cached. *
* This method is not a public API, * and should only be used internally within the serializer. * @param high the high char of * a high/low surrogate pair. * @param low the low char of a high/low surrogate pair. * @param encoding the Java name of the encoding. * * @xsl.usage internal * */ private static boolean inEncoding(char high, char low, String encoding) { boolean isInEncoding; try { char cArray[] = new char[2]; cArray[0] = high; cArray[1] = low; // Construct a String from the char String s = new String(cArray); // Encode the String into a sequence of bytes // using the given, named charset. byte[] bArray = s.getBytes(encoding); isInEncoding = inEncoding(high,bArray); } catch (Exception e) { isInEncoding = false; } return isInEncoding; } /** * This method is the core of determining if character * is in the encoding. The method is not foolproof, because * s.getBytes(encoding) has specified behavior only if the * characters are in the specified encoding. However this * method tries it's best. * @param ch the char that was converted using getBytes, or * the first char of a high/low pair that was converted. * @param data the bytes written out by the call to s.getBytes(encoding); * @return true if the character is in the encoding. */ private static boolean inEncoding(char ch, byte[] data) { final boolean isInEncoding; // If the string written out as data is not in the encoding, // the output is not specified according to the documentation // on the String.getBytes(encoding) method, // but we do our best here. if (data==null || data.length == 0) { isInEncoding = false; } else { if (data[0] == 0) isInEncoding = false; else if (data[0] == '?' && ch != '?') isInEncoding = false; /* * else if (isJapanese) { * // isJapanese is really * // ( "EUC-JP".equals(javaName) * // || "EUC_JP".equals(javaName) * // || "SJIS".equals(javaName) ) * * // Work around some bugs in JRE for Japanese * if(data[0] == 0x21) * isInEncoding = false; * else if (ch == 0xA5) * isInEncoding = false; * else * isInEncoding = true; * } */ else { // We don't know for sure, but it looks like it is in the encoding isInEncoding = true; } } return isInEncoding; } /** * This method exists for performance reasons. *
* Except for '\u0000', if a char is less than or equal to the value * returned by this method then it in the encoding. *
* The characters in an encoding are not contiguous, however * there is a lowest group of chars starting at '\u0001' upto and * including the char returned by this method that are all in the encoding. * So the char returned by this method essentially defines the lowest * contiguous group. *
* chars above the value returned might be in the encoding, but * chars at or below the value returned are definately in the encoding. *
* In any case however, the isInEncoding(char) method can be used * regardless of the value of the char returned by this method. *
* If the value returned is '\u0000' it means that every character must be tested * with an isInEncoding method {@link #isInEncoding(char)} or {@link #isInEncoding(char, char)} * for surrogate pairs. *
* This method is not a public API. * @xsl.usage internal */ public final char getHighChar() { return m_highCharInContiguousGroup; } }