// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2004-2005, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  xmlparser.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004jul21
*   created by: Andy Heninger
*
* Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
* Not suitable for production use. Not supported.
* Not conformant. Not efficient.
* But very small.
*/

#ifndef __XMLPARSER_H__
#define __XMLPARSER_H__

#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/regex.h"
#include "uvector.h"
#include "hash.h"

// Everything below is compiled only when both regular expressions and
// conversion are available in this ICU configuration.
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION

/**
 * Type tag for the child nodes of a UXMLElement.
 * Declared at global scope, before U_NAMESPACE_BEGIN.
 */
enum UXMLNodeType {
    /** Node type string (text contents), stored as a UnicodeString. */
    UXML_NODE_TYPE_STRING,
    /** Node type element, stored as a UXMLElement. */
    UXML_NODE_TYPE_ELEMENT,
    /** Not a node type: its value equals the number of node types listed above. */
    UXML_NODE_TYPE_COUNT
};

U_NAMESPACE_BEGIN

class UXMLParser;

/**
 * This class represents an element node in a parsed XML tree.
 *
 * All constructors are private and UXMLParser is a friend, so code outside
 * this class and UXMLParser cannot construct or copy elements.
 */
class U_TOOLUTIL_API UXMLElement : public UObject {
public:
    /**
     * Destructor.
     */
    virtual ~UXMLElement();

    /**
     * Get the tag name of this element.
     * @return A reference to the tag name string.
     */
    const UnicodeString &getTagName() const;
    /**
     * Get the text contents of the element.
     * Append the contents of all text child nodes.
     * @param recurse If true, also recursively appends the contents of all
     *                text child nodes of element children.
     * @return The text contents.
     */
    UnicodeString getText(UBool recurse) const;
    /**
     * Get the number of attributes.
     */
    int32_t countAttributes() const;
    /**
     * Get the i-th attribute.
     * @param i Index of the attribute.
     * @param name Output parameter, receives the attribute name.
     * @param value Output parameter, receives the attribute value.
     * @return A pointer to the attribute value (may be &value or a pointer to an
     *         internal string object), or nullptr if i is out of bounds.
     */
    const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
    /**
     * Get the value of the attribute with the given name.
     * @param name Attribute name to be looked up.
     * @return A pointer to the attribute value, or nullptr if this element
     *         does not have this attribute.
     */
    const UnicodeString *getAttribute(const UnicodeString &name) const;
    /**
     * Get the number of child nodes.
     */
    int32_t countChildren() const;
    /**
     * Get the i-th child node.
     * @param i Index of the child node.
     * @param type The child node type. Passed by non-const reference, so
     *             presumably an output parameter telling the caller whether the
     *             returned object is a UnicodeString or a UXMLElement (see
     *             UXMLNodeType) -- TODO confirm against the implementation.
     * @return A pointer to the child node object, or nullptr if i is out of bounds.
     */
    const UObject *getChild(int32_t i, UXMLNodeType &type) const;
    /**
     * Get the next child element node, skipping non-element child nodes.
     * @param i Enumeration index; initialize to 0 before getting the first child element.
     *          Passed by non-const reference; keep passing the same variable on
     *          subsequent calls to continue the enumeration.
     * @return A pointer to the next child element, or nullptr if there is none.
     */
    const UXMLElement *nextChildElement(int32_t &i) const;
    /**
     * Get the immediate child element with the given name.
     * If there are multiple child elements with this name, then return
     * the first one.
     * @param name Element name to be looked up.
     * @return A pointer to the element node, or nullptr if this element
     *         does not have this immediate child element.
     */
    const UXMLElement *getChildElement(const UnicodeString &name) const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     */
    virtual UClassID getDynamicClassID() const override;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     */
    static UClassID U_EXPORT2 getStaticClassID();

private:
    // Declared private to prevent default construction, copy construction
    // and assignment from outside the class.
    UXMLElement();
    UXMLElement(const UXMLElement &other);
    UXMLElement &operator=(const UXMLElement &other);

    // NOTE(review): undocumented helper; its signature mirrors getText(), so it
    // presumably appends this element's text to `text` -- confirm in the
    // implementation file.
    void appendText(UnicodeString &text, UBool recurse) const;

    // UXMLParser needs access to the private constructor and members below.
    friend class UXMLParser;

    // The only usable constructor; private, so reachable only from this class
    // and its friend UXMLParser.
    UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);

    const UXMLParser *fParser;       // NOTE(review): presumably the parser passed to the
                                     //   constructor -- confirm in the implementation.
    const UnicodeString *fName;      // The tag name of this element (owned by the UXMLParser)
    UnicodeString fContent;          // The text content of this node.  All element content is
                                     //   concatenated even when there are intervening nested elements
                                     //   (which doesn't happen with most xml files we care about)
                                     //   Sections of content containing only white space are dropped,
                                     //   which gets rid of the bogus white space content from
                                     //   elements which are primarily containers for nested elements.
    UVector fAttNames;               // A vector containing the names of this element's attributes
                                     //   The names are UnicodeString objects, owned by the UXMLParser.
    UVector fAttValues;              // A vector containing the attribute values for
                                     //   this element's attributes.  The order is the same
                                     //   as that of the attribute name vector.

    UVector fChildren;               // The child nodes of this element (a Vector)

    UXMLElement *fParent;            // A pointer to the parent element of this element.
};

/**
 * A simple XML parser; it is neither efficient nor conformant and only useful for
 * restricted types of XML documents.
 *
 * The parse methods parse whole documents and return the parse trees via their
 * root elements.
 *
 * Constructors are private; instances are obtained through createParser().
 */
class U_TOOLUTIL_API UXMLParser : public UObject {
public:
    /**
     * Create an XML parser.
     * @param errorCode ICU error code (in/out, passed by reference).
     * @return A pointer to the new parser.
     */
    static UXMLParser *createParser(UErrorCode &errorCode);
    /**
     * Destructor.
     */
    virtual ~UXMLParser();

    /**
     * Parse an XML document, create the entire document tree, and
     * return a pointer to the root element of the parsed tree.
     * The caller must delete the element.
     * @param src The XML document text.
     * @param errorCode ICU error code (in/out, passed by reference).
     */
    UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
    /**
     * Parse an XML file, create the entire document tree, and
     * return a pointer to the root element of the parsed tree.
     * The caller must delete the element.
     * @param filename Name of the file to parse.
     * @param errorCode ICU error code (in/out, passed by reference).
     */
    UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     */
    virtual UClassID getDynamicClassID() const override;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     */
    static UClassID U_EXPORT2 getStaticClassID();

private:
    // Declared private to prevent default construction, copy construction
    // and assignment from outside the class.
    UXMLParser();
    UXMLParser(const UXMLParser &other);
    UXMLParser &operator=(const UXMLParser &other);

    // constructor (private; use createParser() from outside the class)
    UXMLParser(UErrorCode &status);

    // Internal parsing helpers. Their behavior is defined in the implementation
    // file, which is not visible from this header.
    void parseMisc(UErrorCode &status);
    UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status);
    void error(const char *message, UErrorCode &status);
    UnicodeString scanContent(UErrorCode &status);
    void replaceCharRefs(UnicodeString &s, UErrorCode &status);

    // NOTE(review): presumably adds `s` to the interned-name table (fNames) and
    // returns the interned copy -- confirm in the implementation.
    const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
public:
    // public for UXMLElement only
    // NOTE(review): presumably looks up `s` among the interned names (fNames)
    // without adding it, since the method is const -- confirm in the implementation.
    const UnicodeString *findName(const UnicodeString &s) const;
private:

    // There is one ICU regex matcher for each of the major XML syntax items
    // that are recognized.
    RegexMatcher mXMLDecl;
    RegexMatcher mXMLComment;
    RegexMatcher mXMLSP;
    RegexMatcher mXMLDoctype;
    RegexMatcher mXMLPI;
    RegexMatcher mXMLElemStart;
    RegexMatcher mXMLElemEnd;
    RegexMatcher mXMLElemEmpty;
    RegexMatcher mXMLCharData;
    RegexMatcher mAttrValue;
    RegexMatcher mAttrNormalizer;
    RegexMatcher mNewLineNormalizer;
    RegexMatcher mAmps;

    Hashtable fNames;                // interned element/attribute name strings
    UStack fElementStack;            // Stack holds the parent elements when nested
                                     //   elements are being parsed.  All items on this
                                     //   stack are of type UXMLElement.
    int32_t fPos;                    // String index of the current scan position in
                                     //   xml source (in fSrc).
                                     //   NOTE(review): this class declares no member
                                     //   named fSrc; the reference looks stale.
    UnicodeString fOneLF;            // NOTE(review): undocumented; presumably a string
                                     //   holding a single line feed -- confirm in the
                                     //   implementation.
};

U_NAMESPACE_END
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */

#endif /* __XMLPARSER_H__ */