• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2004-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  xmlparser.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2004jul21
14 *   created by: Andy Heninger
15 */
16 
17 #include <stdio.h>
18 #include "unicode/uchar.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/regex.h"
21 #include "filestrm.h"
22 #include "xmlparser.h"
23 
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
25 
// Code points of the individual characters that the parser refers to by name.
enum {
    x_QUOT = 0x22,   // "
    x_AMP  = 0x26,   // &
    x_APOS = 0x27,   // '
    x_LT   = 0x3c,   // <
    x_GT   = 0x3e,   // >
    x_l    = 0x6c    // l  (used to step past the "<?xml" of an XML declaration)
};
35 
// Regular expression fragments, written as narrow string literals so that they can be
// pasted together by the preprocessor into the larger patterns used by the parser.

// White space characters: space, tab, carriage return, line feed.
#define  XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4:  a character that may start a name.
#define  XML_NAMESTARTCHAR \
    "[" \
        "[A-Z]:_[a-z]" \
        "[\\u00c0-\\u00d6][\\u00d8-\\u00f6][\\u00f8-\\u02ff]" \
        "[\\u0370-\\u037d][\\u037F-\\u1FFF]" \
        "[\\u200C-\\u200D][\\u2070-\\u218F]" \
        "[\\u2C00-\\u2FEF][\\u3001-\\uD7FF]" \
        "[\\uF900-\\uFDCF][\\uFDF0-\\uFFFD]" \
        "[\\U00010000-\\U000EFFFF]" \
    "]"

// XML #5:  a character that may appear anywhere in a name
//          (the name start characters plus a few more).
#define  XML_NAMECHAR \
    "[" XML_NAMESTARTCHAR \
        "\\-.[0-9]\\u00b7" \
        "[\\u0300-\\u036f][\\u203f-\\u2040]" \
    "]"

// XML #6:  a complete name, one start character followed by any number of name characters.
#define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
49 
50 U_NAMESPACE_BEGIN
51 
// RTTI implementation macros for the two classes implemented in this file.
// Each class must be listed exactly once.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
54 
55 //
56 //   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
57 //                             used for parsing.
58 //
59 UXMLParser::UXMLParser(UErrorCode &status) :
60       //  XML Declaration.  XML Production #23.
61       //      example:  "<?xml version=1.0 encoding="utf-16" ?>
62       //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
63       //            allow for a possible leading BOM.
64       mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
65 
66       //  XML Comment   production #15
67       //     example:  "<!-- whatever -->
68       //       note, does not detect an illegal "--" within comments
69       mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
70 
71       //  XML Spaces
72       //      production [3]
73       mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
74 
75       //  XML Doctype decl  production #28
76       //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
77       //       or      "<!DOCTYPE foo [internal dtd]>
78       //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
79       //           Some internal dtd subsets could confuse this simple-minded
80       //           attempt at skipping over them, specifically, occcurences
81       //           of closeing square brackets.  These could appear in comments,
82       //           or in parameter entity declarations, for example.
83       mXMLDoctype(UnicodeString(
84            "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
85            ), 0, status),
86 
87       //  XML PI     production #16
88       //     example   "<?target stuff?>
89       mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
90 
91       //  XML Element Start   Productions #40, #41
92       //          example   <foo att1='abc'  att2="d e f" >
93       //      capture #1:  the tag name
94       //
95       mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
96           "(?:"
97                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
98                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
99           ")*"                                                             //   * for zero or more attributes.
100           XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
101 
102       //  XML Element End     production #42
103       //     example   </foo>
104       mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
105 
106       // XML Element Empty    production #44
107       //     example   <foo att1="abc"   att2="d e f" />
108       mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
109           "(?:"
110                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
111                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
112           ")*"                                                             //   * for zero or more attributes.
113           XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
114 
115 
116       // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
117       mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
118 
119       // Attribute name = "value".  XML Productions 10, 40/41
120       //  Capture group 1 is name,
121       //                2 is the attribute value, including the quotes.
122       //
123       //   Note that attributes are scanned twice.  The first time is with
124       //        the regex for an entire element start.  There, the attributes
125       //        are checked syntactically, but not separted out one by one.
126       //        Here, we match a single attribute, and make its name and
127       //        attribute value available to the parser code.
128       mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
129          "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
130 
131 
132       mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
133 
134       // Match any of the new-line sequences in content.
135       //   All are changed to \u000a.
136       mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
137 
138       // & char references
139       //   We will figure out what we've got based on which capture group has content.
140       //   The last one is a catchall for unrecognized entity references..
141       //             1     2     3      4      5           6                    7          8
142       mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
143                 0, status),
144 
145       fNames(status),
146       fElementStack(status),
147       fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
148       {
149       }
150 
151 UXMLParser *
createParser(UErrorCode & errorCode)152 UXMLParser::createParser(UErrorCode &errorCode) {
153     if (U_FAILURE(errorCode)) {
154         return NULL;
155     } else {
156         return new UXMLParser(errorCode);
157     }
158 }
159 
~UXMLParser()160 UXMLParser::~UXMLParser() {}
161 
162 UXMLElement *
parseFile(const char * filename,UErrorCode & errorCode)163 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
164     char bytes[4096], charsetBuffer[100];
165     FileStream *f;
166     const char *charset, *pb;
167     UnicodeString src;
168     UConverter *cnv;
169     UChar *buffer, *pu;
170     int32_t fileLength, bytesLength, length, capacity;
171     UBool flush;
172 
173     if(U_FAILURE(errorCode)) {
174         return NULL;
175     }
176 
177     f=T_FileStream_open(filename, "rb");
178     if(f==NULL) {
179         errorCode=U_FILE_ACCESS_ERROR;
180         return NULL;
181     }
182 
183     bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
184     if(bytesLength<(int32_t)sizeof(bytes)) {
185         // we have already read the entire file
186         fileLength=bytesLength;
187     } else {
188         // get the file length
189         fileLength=T_FileStream_size(f);
190     }
191 
192     /*
193      * get the charset:
194      * 1. Unicode signature
195      * 2. treat as ISO-8859-1 and read XML encoding="charser"
196      * 3. default to UTF-8
197      */
198     charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
199     if(U_SUCCESS(errorCode) && charset!=NULL) {
200         // open converter according to Unicode signature
201         cnv=ucnv_open(charset, &errorCode);
202     } else {
203         // read as Latin-1 and parse the XML declaration and encoding
204         cnv=ucnv_open("ISO-8859-1", &errorCode);
205         if(U_FAILURE(errorCode)) {
206             // unexpected error opening Latin-1 converter
207             goto exit;
208         }
209 
210         buffer=src.getBuffer(bytesLength);
211         if(buffer==NULL) {
212             // unexpected failure to reserve some string capacity
213             errorCode=U_MEMORY_ALLOCATION_ERROR;
214             goto exit;
215         }
216         pb=bytes;
217         pu=buffer;
218         ucnv_toUnicode(
219             cnv,
220             &pu, buffer+src.getCapacity(),
221             &pb, bytes+bytesLength,
222             NULL, TRUE, &errorCode);
223         src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
224         ucnv_close(cnv);
225         cnv=NULL;
226         if(U_FAILURE(errorCode)) {
227             // unexpected error in conversion from Latin-1
228             src.remove();
229             goto exit;
230         }
231 
232         // parse XML declaration
233         if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
234             int32_t declEnd=mXMLDecl.end(errorCode);
235             // go beyond <?xml
236             int32_t pos=src.indexOf((UChar)x_l)+1;
237 
238             mAttrValue.reset(src);
239             while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
240                 UnicodeString attName  = mAttrValue.group(1, errorCode);
241                 UnicodeString attValue = mAttrValue.group(2, errorCode);
242 
243                 // Trim the quotes from the att value.  These are left over from the original regex
244                 //   that parsed the attribue, which couldn't conveniently strip them.
245                 attValue.remove(0,1);                    // one char from the beginning
246                 attValue.truncate(attValue.length()-1);  // and one from the end.
247 
248                 if(attName==UNICODE_STRING("encoding", 8)) {
249                     length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
250                     charset=charsetBuffer;
251                     break;
252                 }
253                 pos = mAttrValue.end(2, errorCode);
254             }
255 
256             if(charset==NULL) {
257                 // default to UTF-8
258                 charset="UTF-8";
259             }
260             cnv=ucnv_open(charset, &errorCode);
261         }
262     }
263 
264     if(U_FAILURE(errorCode)) {
265         // unable to open the converter
266         goto exit;
267     }
268 
269     // convert the file contents
270     capacity=fileLength;        // estimated capacity
271     src.getBuffer(capacity);
272     src.releaseBuffer(0);       // zero length
273     flush=FALSE;
274     for(;;) {
275         // convert contents of bytes[bytesLength]
276         pb=bytes;
277         for(;;) {
278             length=src.length();
279             buffer=src.getBuffer(capacity);
280             if(buffer==NULL) {
281                 // unexpected failure to reserve some string capacity
282                 errorCode=U_MEMORY_ALLOCATION_ERROR;
283                 goto exit;
284             }
285 
286             pu=buffer+length;
287             ucnv_toUnicode(
288                 cnv, &pu, buffer+src.getCapacity(),
289                 &pb, bytes+bytesLength,
290                 NULL, FALSE, &errorCode);
291             src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
292             if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
293                 errorCode=U_ZERO_ERROR;
294                 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
295             } else {
296                 break;
297             }
298         }
299 
300         if(U_FAILURE(errorCode)) {
301             break; // conversion error
302         }
303 
304         if(flush) {
305             break; // completely converted the file
306         }
307 
308         // read next block
309         bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
310         if(bytesLength==0) {
311             // reached end of file, convert once more to flush the converter
312             flush=TRUE;
313         }
314     };
315 
316 exit:
317     ucnv_close(cnv);
318     T_FileStream_close(f);
319 
320     if(U_SUCCESS(errorCode)) {
321         return parse(src, errorCode);
322     } else {
323         return NULL;
324     }
325 }
326 
327 UXMLElement *
parse(const UnicodeString & src,UErrorCode & status)328 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
329     if(U_FAILURE(status)) {
330         return NULL;
331     }
332 
333     UXMLElement   *root = NULL;
334     fPos = 0; // TODO use just a local pos variable and pass it into functions
335               // where necessary?
336 
337     // set all matchers to work on the input string
338     mXMLDecl.reset(src);
339     mXMLComment.reset(src);
340     mXMLSP.reset(src);
341     mXMLDoctype.reset(src);
342     mXMLPI.reset(src);
343     mXMLElemStart.reset(src);
344     mXMLElemEnd.reset(src);
345     mXMLElemEmpty.reset(src);
346     mXMLCharData.reset(src);
347     mAttrValue.reset(src);
348     mAttrNormalizer.reset(src);
349     mNewLineNormalizer.reset(src);
350     mAmps.reset(src);
351 
352     // Consume the XML Declaration, if present.
353     if (mXMLDecl.lookingAt(fPos, status)) {
354         fPos = mXMLDecl.end(status);
355     }
356 
357     // Consume "misc" [XML production 27] appearing before DocType
358     parseMisc(status);
359 
360     // Consume a DocType declaration, if present.
361     if (mXMLDoctype.lookingAt(fPos, status)) {
362         fPos = mXMLDoctype.end(status);
363     }
364 
365     // Consume additional "misc" [XML production 27] appearing after the DocType
366     parseMisc(status);
367 
368     // Get the root element
369     if (mXMLElemEmpty.lookingAt(fPos, status)) {
370         // Root is an empty element (no nested elements or content)
371         root = createElement(mXMLElemEmpty, status);
372         fPos = mXMLElemEmpty.end(status);
373     } else {
374         if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
375             error("Root Element expected", status);
376             goto errorExit;
377         }
378         root = createElement(mXMLElemStart, status);
379         UXMLElement  *el = root;
380 
381         //
382         // This is the loop that consumes the root element of the document,
383         //      including all nested content.   Nested elements are handled by
384         //      explicit pushes/pops of the element stack; there is no recursion
385         //      in the control flow of this code.
386         //      "el" always refers to the current element, the one to which content
387         //      is being added.  It is above the top of the element stack.
388         for (;;) {
389             // Nested Element Start
390             if (mXMLElemStart.lookingAt(fPos, status)) {
391                 UXMLElement *t = createElement(mXMLElemStart, status);
392                 el->fChildren.addElement(t, status);
393                 t->fParent = el;
394                 fElementStack.push(el, status);
395                 el = t;
396                 continue;
397             }
398 
399             // Text Content.  String is concatenated onto the current node's content,
400             //                but only if it contains something other than spaces.
401             UnicodeString s = scanContent(status);
402             if (s.length() > 0) {
403                 mXMLSP.reset(s);
404                 if (mXMLSP.matches(status) == FALSE) {
405                     // This chunk of text contains something other than just
406                     //  white space. Make a child node for it.
407                     replaceCharRefs(s, status);
408                     el->fChildren.addElement(s.clone(), status);
409                 }
410                 mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
411                 continue;
412             }
413 
414             // Comments.  Discard.
415             if (mXMLComment.lookingAt(fPos, status)) {
416                 fPos = mXMLComment.end(status);
417                 continue;
418             }
419 
420             // PIs.  Discard.
421             if (mXMLPI.lookingAt(fPos, status)) {
422                 fPos = mXMLPI.end(status);
423                 continue;
424             }
425 
426             // Element End
427             if (mXMLElemEnd.lookingAt(fPos, status)) {
428                 fPos = mXMLElemEnd.end(0, status);
429                 const UnicodeString name = mXMLElemEnd.group(1, status);
430                 if (name != *el->fName) {
431                     error("Element start / end tag mismatch", status);
432                     goto errorExit;
433                 }
434                 if (fElementStack.empty()) {
435                     // Close of the root element.  We're done with the doc.
436                     el = NULL;
437                     break;
438                 }
439                 el = (UXMLElement *)fElementStack.pop();
440                 continue;
441             }
442 
443             // Empty Element.  Stored as a child of the current element, but not stacked.
444             if (mXMLElemEmpty.lookingAt(fPos, status)) {
445                 UXMLElement *t = createElement(mXMLElemEmpty, status);
446                 el->fChildren.addElement(t, status);
447                 continue;
448             }
449 
450             // Hit something within the document that doesn't match anything.
451             //   It's an error.
452             error("Unrecognized markup", status);
453             break;
454         }
455 
456         if (el != NULL || !fElementStack.empty()) {
457             // We bailed out early, for some reason.
458             error("Root element not closed.", status);
459             goto errorExit;
460         }
461     }
462 
463     // Root Element parse is complete.
464     // Consume the annoying xml "Misc" that can appear at the end of the doc.
465     parseMisc(status);
466 
467     // We should have reached the end of the input
468     if (fPos != src.length()) {
469         error("Extra content at the end of the document", status);
470         goto errorExit;
471     }
472 
473     // Success!
474     return root;
475 
476 errorExit:
477     delete root;
478     return NULL;
479 }
480 
481 //
482 //  createElement
483 //      We've just matched an element start tag.  Create and fill in a UXMLElement object
484 //      for it.
485 //
486 UXMLElement *
createElement(RegexMatcher & mEl,UErrorCode & status)487 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
488     // First capture group is the element's name.
489     UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
490 
491     // Scan for attributes.
492     int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
493 
494     while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
495         UnicodeString attName  = mAttrValue.group(1, status);
496         UnicodeString attValue = mAttrValue.group(2, status);
497 
498         // Trim the quotes from the att value.  These are left over from the original regex
499         //   that parsed the attribue, which couldn't conveniently strip them.
500         attValue.remove(0,1);                    // one char from the beginning
501         attValue.truncate(attValue.length()-1);  // and one from the end.
502 
503         // XML Attribue value normalization.
504         // This is one of the really screwy parts of the XML spec.
505         // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
506         // Note that non-validating parsers must treat all entities as type CDATA
507         //   which simplifies things some.
508 
509         // Att normalization step 1:  normalize any newlines in the attribute value
510         mNewLineNormalizer.reset(attValue);
511         attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
512 
513         // Next change all xml white space chars to plain \u0020 spaces.
514         mAttrNormalizer.reset(attValue);
515         UnicodeString oneSpace((UChar)0x0020);
516         attValue = mAttrNormalizer.replaceAll(oneSpace, status);
517 
518         // Replace character entities.
519         replaceCharRefs(attValue, status);
520 
521         // Save the attribute name and value in our document structure.
522         el->fAttNames.addElement((void *)intern(attName, status), status);
523         el->fAttValues.addElement(attValue.clone(), status);
524         pos = mAttrValue.end(2, status);
525     }
526     fPos = mEl.end(0, status);
527     return el;
528 }
529 
530 //
531 //  parseMisc
532 //     Consume XML "Misc" [production #27]
533 //        which is any combination of space, PI and comments
534 //      Need to watch end-of-input because xml MISC stuff is allowed after
535 //        the document element, so we WILL scan off the end in this function
536 //
537 void
parseMisc(UErrorCode & status)538 UXMLParser::parseMisc(UErrorCode &status)  {
539     for (;;) {
540         if (fPos >= mXMLPI.input().length()) {
541             break;
542         }
543         if (mXMLPI.lookingAt(fPos, status)) {
544             fPos = mXMLPI.end(status);
545             continue;
546         }
547         if (mXMLSP.lookingAt(fPos, status)) {
548             fPos = mXMLSP.end(status);
549             continue;
550         }
551         if (mXMLComment.lookingAt(fPos, status)) {
552             fPos = mXMLComment.end(status);
553             continue;
554         }
555         break;
556     }
557 }
558 
559 //
560 //  Scan for document content.
561 //
562 UnicodeString
scanContent(UErrorCode & status)563 UXMLParser::scanContent(UErrorCode &status) {
564     UnicodeString  result;
565     if (mXMLCharData.lookingAt(fPos, status)) {
566         result = mXMLCharData.group((int32_t)0, status);
567         // Normalize the new-lines.  (Before char ref substitution)
568         mNewLineNormalizer.reset(result);
569         result = mNewLineNormalizer.replaceAll(fOneLF, status);
570 
571         // TODO:  handle CDATA
572         fPos = mXMLCharData.end(0, status);
573     }
574 
575     return result;
576 }
577 
578 //
579 //   replaceCharRefs
580 //
581 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
582 //       with the corresponding actual character.
583 //
584 void
replaceCharRefs(UnicodeString & s,UErrorCode & status)585 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
586     UnicodeString result;
587     UnicodeString replacement;
588     int     i;
589 
590     mAmps.reset(s);
591     // See the initialization for the regex matcher mAmps.
592     //    Which entity we've matched is determined by which capture group has content,
593     //      which is flaged by start() of that group not being -1.
594     while (mAmps.find()) {
595         if (mAmps.start(1, status) != -1) {
596             replacement.setTo((UChar)x_AMP);
597         } else if (mAmps.start(2, status) != -1) {
598             replacement.setTo((UChar)x_LT);
599         } else if (mAmps.start(3, status) != -1) {
600             replacement.setTo((UChar)x_GT);
601         } else if (mAmps.start(4, status) != -1) {
602             replacement.setTo((UChar)x_APOS);
603         } else if (mAmps.start(5, status) != -1) {
604             replacement.setTo((UChar)x_QUOT);
605         } else if (mAmps.start(6, status) != -1) {
606             UnicodeString hexString = mAmps.group(6, status);
607             UChar32 val = 0;
608             for (i=0; i<hexString.length(); i++) {
609                 val = (val << 4) + u_digit(hexString.charAt(i), 16);
610             }
611             // TODO:  some verification that the character is valid
612             replacement.setTo(val);
613         } else if (mAmps.start(7, status) != -1) {
614             UnicodeString decimalString = mAmps.group(7, status);
615             UChar32 val = 0;
616             for (i=0; i<decimalString.length(); i++) {
617                 val = val*10 + u_digit(decimalString.charAt(i), 10);
618             }
619             // TODO:  some verification that the character is valid
620             replacement.setTo(val);
621         } else {
622             // An unrecognized &entity;  Leave it alone.
623             //  TODO:  check that it really looks like an entity, and is not some
624             //         random & in the text.
625             replacement = mAmps.group((int32_t)0, status);
626         }
627         mAmps.appendReplacement(result, replacement, status);
628     }
629     mAmps.appendTail(result);
630     s = result;
631 }
632 
633 void
error(const char * message,UErrorCode & status)634 UXMLParser::error(const char *message, UErrorCode &status) {
635     // TODO:  something better here...
636     const UnicodeString &src=mXMLDecl.input();
637     int  line = 0;
638     int  ci = 0;
639     while (ci < fPos && ci>=0) {
640         ci = src.indexOf((UChar)0x0a, ci+1);
641         line++;
642     }
643     fprintf(stderr, "Error: %s at line %d\n", message, line);
644     if (U_SUCCESS(status)) {
645         status = U_PARSE_ERROR;
646     }
647 }
648 
649 // intern strings like in Java
650 
651 const UnicodeString *
intern(const UnicodeString & s,UErrorCode & errorCode)652 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
653     const UHashElement *he=fNames.find(s);
654     if(he!=NULL) {
655         // already a known name, return its hashed key pointer
656         return (const UnicodeString *)he->key.pointer;
657     } else {
658         // add this new name and return its hashed key pointer
659         fNames.puti(s, 0, errorCode);
660         he=fNames.find(s);
661         return (const UnicodeString *)he->key.pointer;
662     }
663 }
664 
665 const UnicodeString *
findName(const UnicodeString & s) const666 UXMLParser::findName(const UnicodeString &s) const {
667     const UHashElement *he=fNames.find(s);
668     if(he!=NULL) {
669         // a known name, return its hashed key pointer
670         return (const UnicodeString *)he->key.pointer;
671     } else {
672         // unknown name
673         return NULL;
674     }
675 }
676 
677 // UXMLElement ------------------------------------------------------------- ***
678 
UXMLElement(const UXMLParser * parser,const UnicodeString * name,UErrorCode & errorCode)679 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
680    fParser(parser),
681    fName(name),
682    fAttNames(errorCode),
683    fAttValues(errorCode),
684    fChildren(errorCode),
685    fParent(NULL)
686 {
687 }
688 
~UXMLElement()689 UXMLElement::~UXMLElement() {
690     int   i;
691     // attribute names are owned by the UXMLParser, don't delete them here
692     for (i=fAttValues.size()-1; i>=0; i--) {
693         delete (UObject *)fAttValues.elementAt(i);
694     }
695     for (i=fChildren.size()-1; i>=0; i--) {
696         delete (UObject *)fChildren.elementAt(i);
697     }
698 }
699 
700 const UnicodeString &
getTagName() const701 UXMLElement::getTagName() const {
702     return *fName;
703 }
704 
705 UnicodeString
getText(UBool recurse) const706 UXMLElement::getText(UBool recurse) const {
707     UnicodeString text;
708     appendText(text, recurse);
709     return text;
710 }
711 
712 void
appendText(UnicodeString & text,UBool recurse) const713 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
714     const UObject *node;
715     int32_t i, count=fChildren.size();
716     for(i=0; i<count; ++i) {
717         node=(const UObject *)fChildren.elementAt(i);
718         const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
719         if(s!=NULL) {
720             text.append(*s);
721         } else if(recurse) /* must be a UXMLElement */ {
722             ((const UXMLElement *)node)->appendText(text, recurse);
723         }
724     }
725 }
726 
727 int32_t
countAttributes() const728 UXMLElement::countAttributes() const {
729     return fAttNames.size();
730 }
731 
732 const UnicodeString *
getAttribute(int32_t i,UnicodeString & name,UnicodeString & value) const733 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
734     if(0<=i && i<fAttNames.size()) {
735         name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
736         value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
737         return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
738     } else {
739         return NULL;
740     }
741 }
742 
743 const UnicodeString *
getAttribute(const UnicodeString & name) const744 UXMLElement::getAttribute(const UnicodeString &name) const {
745     // search for the attribute name by comparing the interned pointer,
746     // not the string contents
747     const UnicodeString *p=fParser->findName(name);
748     if(p==NULL) {
749         return NULL; // no such attribute seen by the parser at all
750     }
751 
752     int32_t i, count=fAttNames.size();
753     for(i=0; i<count; ++i) {
754         if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
755             return (const UnicodeString *)fAttValues.elementAt(i);
756         }
757     }
758     return NULL;
759 }
760 
761 int32_t
countChildren() const762 UXMLElement::countChildren() const {
763     return fChildren.size();
764 }
765 
766 const UObject *
getChild(int32_t i,UXMLNodeType & type) const767 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
768     if(0<=i && i<fChildren.size()) {
769         const UObject *node=(const UObject *)fChildren.elementAt(i);
770         if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
771             type=UXML_NODE_TYPE_ELEMENT;
772         } else {
773             type=UXML_NODE_TYPE_STRING;
774         }
775         return node;
776     } else {
777         return NULL;
778     }
779 }
780 
781 const UXMLElement *
nextChildElement(int32_t & i) const782 UXMLElement::nextChildElement(int32_t &i) const {
783     if(i<0) {
784         return NULL;
785     }
786 
787     const UObject *node;
788     int32_t count=fChildren.size();
789     while(i<count) {
790         node=(const UObject *)fChildren.elementAt(i++);
791         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
792         if(elem!=NULL) {
793             return elem;
794         }
795     }
796     return NULL;
797 }
798 
799 const UXMLElement *
getChildElement(const UnicodeString & name) const800 UXMLElement::getChildElement(const UnicodeString &name) const {
801     // search for the element name by comparing the interned pointer,
802     // not the string contents
803     const UnicodeString *p=fParser->findName(name);
804     if(p==NULL) {
805         return NULL; // no such element seen by the parser at all
806     }
807 
808     const UObject *node;
809     int32_t i, count=fChildren.size();
810     for(i=0; i<count; ++i) {
811         node=(const UObject *)fChildren.elementAt(i);
812         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
813         if(elem!=NULL) {
814             if(p==elem->fName) {
815                 return elem;
816             }
817         }
818     }
819     return NULL;
820 }
821 
822 U_NAMESPACE_END
823 
824 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
825 
826