/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * $Id: CharInfo.java 468654 2006-10-28 07:09:23Z minchau $ */ package org.apache.xml.serializer; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.URL; import java.util.Enumeration; import java.util.HashMap; import java.util.Hashtable; import java.util.PropertyResourceBundle; import java.util.ResourceBundle; import java.security.AccessController; import java.security.PrivilegedAction; import javax.xml.transform.TransformerException; import org.apache.xml.serializer.utils.MsgKey; import org.apache.xml.serializer.utils.SystemIDResolver; import org.apache.xml.serializer.utils.Utils; import org.apache.xml.serializer.utils.WrappedRuntimeException; /** * This class provides services that tell if a character should have * special treatement, such as entity reference substitution or normalization * of a newline character. It also provides character to entity reference * lookup. * * DEVELOPERS: See Known Issue in the constructor. * * @xsl.usage internal */ final class CharInfo { /** Given a character, lookup a String to output (e.g. a decorated entity reference). 
*/
    private HashMap m_charToString;

    /**
     * The name of the HTML entities file.
     * If specified, the file will be resource loaded with the default class loader.
     */
    public static final String HTML_ENTITIES_RESOURCE =
                SerializerBase.PKG_NAME+".HTMLEntities";

    /**
     * The name of the XML entities file.
     * If specified, the file will be resource loaded with the default class loader.
     */
    public static final String XML_ENTITIES_RESOURCE =
                SerializerBase.PKG_NAME+".XMLEntities";

    /** The horizontal tab character (0x09), which the parser should always normalize. */
    static final char S_HORIZONAL_TAB = 0x09;

    /** The linefeed character (0x0A), which the parser should always normalize. */
    static final char S_LINEFEED = 0x0A;

    /** The carriage return character (0x0D), which the parser should always normalize. */
    static final char S_CARRIAGERETURN = 0x0D;

    /** The space character (0x20). */
    static final char S_SPACE = 0x20;

    /** The double quotation mark character (0x22). */
    static final char S_QUOTE = 0x22;

    /** The less-than character (0x3C). */
    static final char S_LT = 0x3C;

    /** The greater-than character (0x3E). */
    static final char S_GT = 0x3E;

    /** The character with code 0x85 (Unicode NEXT LINE, NEL). */
    static final char S_NEL = 0x85;

    /** The character with code 0x2028 (Unicode LINE SEPARATOR). */
    static final char S_LINE_SEPARATOR = 0x2028;

    /** This flag is an optimization for HTML entities. It is false if entities
     * other than quot (34), amp (38), lt (60) and gt (62) are defined
     * in the range 0 to 127.
     * @xsl.usage internal
     */
    boolean onlyQuotAmpLtGt;

    /**
     * Length of the two per-character ASCII lookup arrays below; they are
     * indexed by the character values 0, 1 ... ASCII_MAX - 1.
     */
    static final int ASCII_MAX = 128;

    /** Array of values is faster access than a set of bits
     * to quickly check ASCII characters in attribute values,
     * the value is true if the character in an attribute value
     * should be mapped to a String.
     */
    private final boolean[] shouldMapAttrChar_ASCII;

    /** Array of values is faster access than a set of bits
     * to quickly check ASCII characters in text nodes,
     * the value is true if the character in a text node
     * should be mapped to a String.
     */
    private final boolean[] shouldMapTextChar_ASCII;

    /** An array of bits to record if the character is in the set.
     * Although information in this array is complete, the
     * isSpecialAttrASCII array is used first because access to its values
     * is common and faster.
     * NOTE(review): no field named isSpecialAttrASCII is declared here;
     * this presumably refers to the shouldMap*_ASCII arrays above - confirm.
     */
    private final int array_of_bits[];

    // 5 for 32 bit words, 6 for 64 bit words ...
    /*
     * This constant is used to shift an integer to quickly
     * calculate which element its bit is stored in.
     * 5 for 32 bit words (int) , 6 for 64 bit words (long)
     */
    private static final int SHIFT_PER_WORD = 5;

    /*
     * A mask to get the low order bits which are used to
     * calculate the value of the bit within a given word,
     * that will represent the presence of the integer in the
     * set.
     *
     * 0x1F for 32 bit words (int),
     * or 0x3F for 64 bit words (long)
     */
    private static final int LOW_ORDER_BITMASK = 0x1f;

    /*
     * This is used for optimizing the lookup of bits representing
     * the integers in the set. It is the index of the first element
     * in the array array_of_bits[] that is not used.
     */
    private int firstWordNotUsed;

    /**
     * A base constructor just to explicitly create the fields,
     * with the exception of m_charToString which is handled
     * by the constructor that delegates base construction to this one.
     *
* m_charToString is not created here only for performance reasons,
     * to avoid creating a map that will be replaced when
     * making a mutable copy, {@link #mutableCopyOf(CharInfo)}.
     *
     */
    private CharInfo()
    {
        // Empty bit set; 65535 is the maximum value handed to
        // createEmptySetOfIntegers.
        this.array_of_bits = createEmptySetOfIntegers(65535);
        this.firstWordNotUsed = 0;
        this.shouldMapAttrChar_ASCII = new boolean[ASCII_MAX];
        this.shouldMapTextChar_ASCII = new boolean[ASCII_MAX];
        this.m_charKey = new CharKey();

        // Not set here, but in a constructor that uses this one
        // this.m_charToString = new Hashtable();

        // Starts out true; the delegating constructor overwrites it after
        // reading the entity definitions.
        this.onlyQuotAmpLtGt = true;

        return;
    }

    /**
     * Builds a CharInfo by reading entity definitions from the named resource
     * and registering each one through defineEntity.
     *
     * When <code>internal</code> is true the resource is first looked up as a
     * resource bundle; if that fails it is read as a stream obtained from
     * <code>CharInfo.class.getResourceAsStream</code>. When
     * <code>internal</code> is false the stream comes from the class loader
     * returned by <code>ObjectFactory.findClassLoader()</code> (or the system
     * class loader if that is null) and, failing that, from opening
     * <code>entitiesResource</code> as a URL.
     *
     * @param entitiesResource name of the entities resource to load
     * @param method the output method; Method.XML and Method.HTML receive the
     *        adjustments made at the end of this constructor
     * @param internal true to use the bundle / class-resource lookup, false to
     *        use the class-loader / URL lookup
     * @throws RuntimeException if no stream could be obtained or any exception
     *         occurs while reading it; a NumberFormatException from the
     *         resource-bundle branch propagates unwrapped
     */
    private CharInfo(String entitiesResource, String method, boolean internal)
    {
        // call the default constructor to create the fields
        this();
        m_charToString = new HashMap();

        ResourceBundle entities = null;
        boolean noExtraEntities = true;

        // Make various attempts to interpret the parameter as a properties
        // file or resource file, as follows:
        //
        //   1) attempt to load .properties file using ResourceBundle
        //      (only when internal is true)
        //   2) try using the class loader to find the specified file as a
        //      resource file
        //   3) try treating the resource as a URI (only when internal is false)

        if (internal) {
            try {
                // Load entity property files by using PropertyResourceBundle,
                // because of a security issue for applets
                entities = PropertyResourceBundle.getBundle(entitiesResource);
            } catch (Exception e) {}
            // A failure here is deliberately ignored: entities stays null and
            // the stream-based branch below is used instead.
        }

        if (entities != null) {
            // Bundle branch: every key is an entity name and its value the
            // decimal character code.
            Enumeration keys = entities.getKeys();
            while (keys.hasMoreElements()){
                String name = (String) keys.nextElement();
                String value = entities.getString(name);
                int code = Integer.parseInt(value);
                // The cast keeps only the low 16 bits of code.
                boolean extra = defineEntity(name, (char) code);
                if (extra)
                    noExtraEntities = false;
            }
        } else {
            InputStream is = null;

            // Stream branch: obtain an InputStream for the resource, by class
            // resource lookup (internal) or class loader then URL (external).
            try {
                if (internal) {
                    is = CharInfo.class.getResourceAsStream(entitiesResource);
                } else {
                    ClassLoader cl = ObjectFactory.findClassLoader();
                    if (cl == null) {
                        is = ClassLoader.getSystemResourceAsStream(entitiesResource);
                    } else {
                        is = cl.getResourceAsStream(entitiesResource);
                    }

                    if (is == null) {
                        // Last resort: treat the name as a URL. Any failure is
                        // ignored and reported by the null check below.
                        try {
                            URL url = new URL(entitiesResource);
                            is = url.openStream();
                        } catch (Exception e) {}
                    }
                }

                if (is == null) {
                    // This throw is inside the enclosing try, so it is caught
                    // by the catch (Exception e) below and re-thrown with the
                    // ER_RESOURCE_COULD_NOT_LOAD message.
                    throw new RuntimeException(
                        Utils.messages.createMessage(
                            MsgKey.ER_RESOURCE_COULD_NOT_FIND,
                            new Object[] {entitiesResource, entitiesResource}));
                }

                // Fix Bugzilla#4000: force reading in UTF-8
                // This creates the de facto standard that Xalan's resource
                // files must be encoded in UTF-8. This should work in all
                // JVMs.
                //
                // %REVIEW% KNOWN ISSUE: IT FAILS IN MICROSOFT VJ++, which
                // didn't implement the UTF-8 encoding. Theoretically, we should
                // simply let it fail in that case, since the JVM is obviously
                // broken if it doesn't support such a basic standard. But
                // since there are still some users attempting to use VJ++ for
                // development, we have dropped in a fallback which makes a
                // second attempt using the platform's default encoding. In VJ++
                // this is apparently ASCII, which is subset of UTF-8... and
                // since the strings we'll be reading here are also primarily
                // limited to the 7-bit ASCII range (at least, in English
                // versions of Xalan), this should work well enough to keep us
                // on the air until we're ready to officially decommit from
                // VJ++.

                BufferedReader reader;
                try {
                    reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                } catch (UnsupportedEncodingException e) {
                    reader = new BufferedReader(new InputStreamReader(is));
                }

                String line = reader.readLine();

                while (line != null) {
                    // Skip empty lines and lines whose first character is '#'.
                    if (line.length() == 0 || line.charAt(0) == '#') {
                        line = reader.readLine();

                        continue;
                    }

                    // Expected shape: "<name> <decimal code>[ <ignored rest>]".
                    int index = line.indexOf(' ');

                    // index > 1 means the name before the first space must be
                    // at least two characters long; other lines are skipped.
                    if (index > 1) {
                        String name = line.substring(0, index);

                        ++index;

                        if (index < line.length()) {
                            String value = line.substring(index);
                            index = value.indexOf(' ');

                            // Drop anything after a second space.
                            if (index > 0) {
                                value = value.substring(0, index);
                            }

                            int code = Integer.parseInt(value);

                            // The cast keeps only the low 16 bits of code.
                            boolean extra = defineEntity(name, (char) code);
                            if (extra)
                                noExtraEntities = false;
                        }
                    }

                    line = reader.readLine();
                }

                is.close();
            } catch (Exception e) {
                // Only e.toString() is placed in the message; the original
                // exception is not attached as a cause.
                throw new RuntimeException(
                    Utils.messages.createMessage(
                        MsgKey.ER_RESOURCE_COULD_NOT_LOAD,
                        new Object[] { entitiesResource,
                                       e.toString(),
                                       entitiesResource,
                                       e.toString()}));
            } finally {
                // Second, best-effort close covering the failure paths.
                if (is != null) {
                    try {
                        is.close();
                    } catch (Exception except) {}
                }
            }
        }

        onlyQuotAmpLtGt = noExtraEntities;

        /* Now that we've used get(ch) just above to initialize the
         * two arrays we will change by adding a tab to the set of
         * special chars for XML (but not HTML!).
         * We do this because a tab is always a
         * special character in an XML attribute,
         * but only a special character in XML text
         * if it has an entity defined for it.
         * This is the reason for this delay.
         *
         * NOTE(review): the statements below do not add a tab; they only
         * clear the entries for the quotation mark and '<'. The paragraph
         * above looks stale - confirm.
         */
        if (Method.XML.equals(method))
        {
            // We choose not to escape the quotation mark as &quot; in text nodes
            shouldMapTextChar_ASCII[S_QUOTE] = false;
        }

        if (Method.HTML.equals(method)) {
            // The XSLT 1.0 recommendation says
            // "The html output method should not escape < characters occurring in attribute values."
            // So we don't escape '<' in an attribute for HTML
            shouldMapAttrChar_ASCII['<'] = false;

            // We choose not to escape the quotation mark as &quot; in text nodes.
            shouldMapTextChar_ASCII[S_QUOTE] = false;
        }
    }

    /**
     * Defines a new character reference.
     * The reference's name and value are
     * supplied. Nothing happens if the character reference is already defined.
     * <p>
     * Unlike internal entities, character references are a string to single
     * character mapping. They are used to map non-ASCII characters both on
     * parsing and printing, primarily for HTML documents. '&amp;lt;' is an
     * example of a character reference.
* * @param name The entity's name * @param value The entity's value * @return true if the mapping is not one of: ** # First char # is a comment * Entity numericValue * quot 34 * amp 38 ** (Note: Why don't we just switch to .properties files? Oct-01 -sc) * * @param entitiesResource Name of entities resource file that should * be loaded, which describes that mapping of characters to entity references. * @param method the output method type, which should be one of "xml", "html", "text"... * * @xsl.usage internal */ static CharInfo getCharInfo(String entitiesFileName, String method) { CharInfo charInfo = (CharInfo) m_getCharInfoCache.get(entitiesFileName); if (charInfo != null) { return mutableCopyOf(charInfo); } // try to load it internally - cache try { charInfo = getCharInfoBasedOnPrivilege(entitiesFileName, method, true); // Put the common copy of charInfo in the cache, but return // a copy of it. m_getCharInfoCache.put(entitiesFileName, charInfo); return mutableCopyOf(charInfo); } catch (Exception e) {} // try to load it externally - do not cache try { return getCharInfoBasedOnPrivilege(entitiesFileName, method, false); } catch (Exception e) {} String absoluteEntitiesFileName; if (entitiesFileName.indexOf(':') < 0) { absoluteEntitiesFileName = SystemIDResolver.getAbsoluteURIFromRelative(entitiesFileName); } else { try { absoluteEntitiesFileName = SystemIDResolver.getAbsoluteURI(entitiesFileName, null); } catch (TransformerException te) { throw new WrappedRuntimeException(te); } } return getCharInfoBasedOnPrivilege(entitiesFileName, method, false); } /** * Create a mutable copy of the cached one. * @param charInfo The cached one. 
* @return */ private static CharInfo mutableCopyOf(CharInfo charInfo) { CharInfo copy = new CharInfo(); int max = charInfo.array_of_bits.length; System.arraycopy(charInfo.array_of_bits,0,copy.array_of_bits,0,max); copy.firstWordNotUsed = charInfo.firstWordNotUsed; max = charInfo.shouldMapAttrChar_ASCII.length; System.arraycopy(charInfo.shouldMapAttrChar_ASCII,0,copy.shouldMapAttrChar_ASCII,0,max); max = charInfo.shouldMapTextChar_ASCII.length; System.arraycopy(charInfo.shouldMapTextChar_ASCII,0,copy.shouldMapTextChar_ASCII,0,max); // utility field copy.m_charKey is already created in the default constructor copy.m_charToString = (HashMap) charInfo.m_charToString.clone(); copy.onlyQuotAmpLtGt = charInfo.onlyQuotAmpLtGt; return copy; } /** * Table of user-specified char infos. * The table maps entify file names (the name of the * property file without the .properties extension) * to CharInfo objects populated with entities defined in * corresponding property file. */ private static Hashtable m_getCharInfoCache = new Hashtable(); /** * Returns the array element holding the bit value for the * given integer * @param i the integer that might be in the set of integers * */ private static int arrayIndex(int i) { return (i >> SHIFT_PER_WORD); } /** * For a given integer in the set it returns the single bit * value used within a given word that represents whether * the integer is in the set or not. */ private static int bit(int i) { int ret = (1 << (i & LOW_ORDER_BITMASK)); return ret; } /** * Creates a new empty set of integers (characters) * @param max the maximum integer to be in the set. */ private int[] createEmptySetOfIntegers(int max) { firstWordNotUsed = 0; // an optimization int[] arr = new int[arrayIndex(max - 1) + 1]; return arr; } /** * Adds the integer (character) to the set of integers. * @param i the integer to add to the set, valid values are * 0, 1, 2 ... up to the maximum that was specified at * the creation of the set. 
*/ private final void set(int i) { setASCIItextDirty(i); setASCIIattrDirty(i); int j = (i >> SHIFT_PER_WORD); // this word is used int k = j + 1; if(firstWordNotUsed < k) // for optimization purposes. firstWordNotUsed = k; array_of_bits[j] |= (1 << (i & LOW_ORDER_BITMASK)); } /** * Return true if the integer (character)is in the set of integers. * * This implementation uses an array of integers with 32 bits per * integer. If a bit is set to 1 the corresponding integer is * in the set of integers. * * @param i an integer that is tested to see if it is the * set of integers, or not. */ private final boolean get(int i) { boolean in_the_set = false; int j = (i >> SHIFT_PER_WORD); // wordIndex(i) // an optimization here, ... a quick test to see // if this integer is beyond any of the words in use if(j < firstWordNotUsed) in_the_set = (array_of_bits[j] & (1 << (i & LOW_ORDER_BITMASK)) ) != 0; // 0L for 64 bit words return in_the_set; } /** * This method returns true if there are some non-standard mappings to * entities other than quot, amp, lt, gt, and its only purpose is for * performance. * @param charToMap The value of the character that is mapped to a String * @param outputString The String to which the character is mapped, usually * an entity reference such as "<". * @return true if the mapping is not one of: *