• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2004-2010, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  xmlparser.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2004jul21
16 *   created by: Andy Heninger
17 */
18 
19 #include <stdio.h>
20 #include "unicode/uchar.h"
21 #include "unicode/ucnv.h"
22 #include "unicode/regex.h"
23 #include "filestrm.h"
24 #include "xmlparser.h"
25 
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
27 
// Code points of the ASCII characters that the parser refers to by name.
enum {
    x_QUOT = 0x22,   // "
    x_AMP  = 0x26,   // &
    x_APOS = 0x27,   // '
    x_LT   = 0x3c,   // <
    x_GT   = 0x3e,   // >
    x_l    = 0x6c    // l
};
37 
// Regex set matching a single XML white space character
// (space, tab, carriage return, line feed).
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4: regex set matching a character that may start a name.
#define XML_NAMESTARTCHAR \
    "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

// XML #5: regex set matching a character that may continue a name.
// This is the name-start set plus '-', '.', digits and a few more ranges.
#define XML_NAMECHAR \
    "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

// XML #6: a complete name, i.e. one name-start character followed by
// any number of name characters.
#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
51 
52 U_NAMESPACE_BEGIN
53 
// Expand the ICU RTTI implementation macro once for each class defined in this file.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
56 
57 //
58 //   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
59 //                             used for parsing.
60 //
61 UXMLParser::UXMLParser(UErrorCode &status) :
62       //  XML Declaration.  XML Production #23.
63       //      example:  "<?xml version=1.0 encoding="utf-16" ?>
64       //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
65       //            allow for a possible leading BOM.
66       mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
67 
68       //  XML Comment   production #15
69       //     example:  "<!-- whatever -->
70       //       note, does not detect an illegal "--" within comments
71       mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
72 
73       //  XML Spaces
74       //      production [3]
75       mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
76 
77       //  XML Doctype decl  production #28
78       //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
79       //       or      "<!DOCTYPE foo [internal dtd]>
80       //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
81       //           Some internal dtd subsets could confuse this simple-minded
82       //           attempt at skipping over them, specifically, occcurences
83       //           of closeing square brackets.  These could appear in comments,
84       //           or in parameter entity declarations, for example.
85       mXMLDoctype(UnicodeString(
86            "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
87            ), 0, status),
88 
89       //  XML PI     production #16
90       //     example   "<?target stuff?>
91       mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
92 
93       //  XML Element Start   Productions #40, #41
94       //          example   <foo att1='abc'  att2="d e f" >
95       //      capture #1:  the tag name
96       //
97       mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
98           "(?:"
99                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
100                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
101           ")*"                                                             //   * for zero or more attributes.
102           XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
103 
104       //  XML Element End     production #42
105       //     example   </foo>
106       mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
107 
108       // XML Element Empty    production #44
109       //     example   <foo att1="abc"   att2="d e f" />
110       mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
111           "(?:"
112                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
113                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
114           ")*"                                                             //   * for zero or more attributes.
115           XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
116 
117 
118       // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
119       mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
120 
121       // Attribute name = "value".  XML Productions 10, 40/41
122       //  Capture group 1 is name,
123       //                2 is the attribute value, including the quotes.
124       //
125       //   Note that attributes are scanned twice.  The first time is with
126       //        the regex for an entire element start.  There, the attributes
127       //        are checked syntactically, but not separted out one by one.
128       //        Here, we match a single attribute, and make its name and
129       //        attribute value available to the parser code.
130       mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
131          "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
132 
133 
134       mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
135 
136       // Match any of the new-line sequences in content.
137       //   All are changed to \u000a.
138       mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
139 
140       // & char references
141       //   We will figure out what we've got based on which capture group has content.
142       //   The last one is a catchall for unrecognized entity references..
143       //             1     2     3      4      5           6                    7          8
144       mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
145                 0, status),
146 
147       fNames(status),
148       fElementStack(status),
149       fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
150       {
151       }
152 
153 UXMLParser *
createParser(UErrorCode & errorCode)154 UXMLParser::createParser(UErrorCode &errorCode) {
155     if (U_FAILURE(errorCode)) {
156         return NULL;
157     } else {
158         return new UXMLParser(errorCode);
159     }
160 }
161 
~UXMLParser()162 UXMLParser::~UXMLParser() {}
163 
164 UXMLElement *
parseFile(const char * filename,UErrorCode & errorCode)165 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
166     char bytes[4096], charsetBuffer[100];
167     FileStream *f;
168     const char *charset, *pb;
169     UnicodeString src;
170     UConverter *cnv;
171     UChar *buffer, *pu;
172     int32_t fileLength, bytesLength, length, capacity;
173     UBool flush;
174 
175     if(U_FAILURE(errorCode)) {
176         return NULL;
177     }
178 
179     f=T_FileStream_open(filename, "rb");
180     if(f==NULL) {
181         errorCode=U_FILE_ACCESS_ERROR;
182         return NULL;
183     }
184 
185     bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
186     if(bytesLength<(int32_t)sizeof(bytes)) {
187         // we have already read the entire file
188         fileLength=bytesLength;
189     } else {
190         // get the file length
191         fileLength=T_FileStream_size(f);
192     }
193 
194     /*
195      * get the charset:
196      * 1. Unicode signature
197      * 2. treat as ISO-8859-1 and read XML encoding="charser"
198      * 3. default to UTF-8
199      */
200     charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
201     if(U_SUCCESS(errorCode) && charset!=NULL) {
202         // open converter according to Unicode signature
203         cnv=ucnv_open(charset, &errorCode);
204     } else {
205         // read as Latin-1 and parse the XML declaration and encoding
206         cnv=ucnv_open("ISO-8859-1", &errorCode);
207         if(U_FAILURE(errorCode)) {
208             // unexpected error opening Latin-1 converter
209             goto exit;
210         }
211 
212         buffer=toUCharPtr(src.getBuffer(bytesLength));
213         if(buffer==NULL) {
214             // unexpected failure to reserve some string capacity
215             errorCode=U_MEMORY_ALLOCATION_ERROR;
216             goto exit;
217         }
218         pb=bytes;
219         pu=buffer;
220         ucnv_toUnicode(
221             cnv,
222             &pu, buffer+src.getCapacity(),
223             &pb, bytes+bytesLength,
224             NULL, TRUE, &errorCode);
225         src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
226         ucnv_close(cnv);
227         cnv=NULL;
228         if(U_FAILURE(errorCode)) {
229             // unexpected error in conversion from Latin-1
230             src.remove();
231             goto exit;
232         }
233 
234         // parse XML declaration
235         if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
236             int32_t declEnd=mXMLDecl.end(errorCode);
237             // go beyond <?xml
238             int32_t pos=src.indexOf((UChar)x_l)+1;
239 
240             mAttrValue.reset(src);
241             while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
242                 UnicodeString attName  = mAttrValue.group(1, errorCode);
243                 UnicodeString attValue = mAttrValue.group(2, errorCode);
244 
245                 // Trim the quotes from the att value.  These are left over from the original regex
246                 //   that parsed the attribue, which couldn't conveniently strip them.
247                 attValue.remove(0,1);                    // one char from the beginning
248                 attValue.truncate(attValue.length()-1);  // and one from the end.
249 
250                 if(attName==UNICODE_STRING("encoding", 8)) {
251                     length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
252                     charset=charsetBuffer;
253                     break;
254                 }
255                 pos = mAttrValue.end(2, errorCode);
256             }
257 
258             if(charset==NULL) {
259                 // default to UTF-8
260                 charset="UTF-8";
261             }
262             cnv=ucnv_open(charset, &errorCode);
263         }
264     }
265 
266     if(U_FAILURE(errorCode)) {
267         // unable to open the converter
268         goto exit;
269     }
270 
271     // convert the file contents
272     capacity=fileLength;        // estimated capacity
273     src.getBuffer(capacity);
274     src.releaseBuffer(0);       // zero length
275     flush=FALSE;
276     for(;;) {
277         // convert contents of bytes[bytesLength]
278         pb=bytes;
279         for(;;) {
280             length=src.length();
281             buffer=toUCharPtr(src.getBuffer(capacity));
282             if(buffer==NULL) {
283                 // unexpected failure to reserve some string capacity
284                 errorCode=U_MEMORY_ALLOCATION_ERROR;
285                 goto exit;
286             }
287 
288             pu=buffer+length;
289             ucnv_toUnicode(
290                 cnv, &pu, buffer+src.getCapacity(),
291                 &pb, bytes+bytesLength,
292                 NULL, FALSE, &errorCode);
293             src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
294             if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
295                 errorCode=U_ZERO_ERROR;
296                 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
297             } else {
298                 break;
299             }
300         }
301 
302         if(U_FAILURE(errorCode)) {
303             break; // conversion error
304         }
305 
306         if(flush) {
307             break; // completely converted the file
308         }
309 
310         // read next block
311         bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
312         if(bytesLength==0) {
313             // reached end of file, convert once more to flush the converter
314             flush=TRUE;
315         }
316     };
317 
318 exit:
319     ucnv_close(cnv);
320     T_FileStream_close(f);
321 
322     if(U_SUCCESS(errorCode)) {
323         return parse(src, errorCode);
324     } else {
325         return NULL;
326     }
327 }
328 
329 UXMLElement *
parse(const UnicodeString & src,UErrorCode & status)330 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
331     if(U_FAILURE(status)) {
332         return NULL;
333     }
334 
335     UXMLElement   *root = NULL;
336     fPos = 0; // TODO use just a local pos variable and pass it into functions
337               // where necessary?
338 
339     // set all matchers to work on the input string
340     mXMLDecl.reset(src);
341     mXMLComment.reset(src);
342     mXMLSP.reset(src);
343     mXMLDoctype.reset(src);
344     mXMLPI.reset(src);
345     mXMLElemStart.reset(src);
346     mXMLElemEnd.reset(src);
347     mXMLElemEmpty.reset(src);
348     mXMLCharData.reset(src);
349     mAttrValue.reset(src);
350     mAttrNormalizer.reset(src);
351     mNewLineNormalizer.reset(src);
352     mAmps.reset(src);
353 
354     // Consume the XML Declaration, if present.
355     if (mXMLDecl.lookingAt(fPos, status)) {
356         fPos = mXMLDecl.end(status);
357     }
358 
359     // Consume "misc" [XML production 27] appearing before DocType
360     parseMisc(status);
361 
362     // Consume a DocType declaration, if present.
363     if (mXMLDoctype.lookingAt(fPos, status)) {
364         fPos = mXMLDoctype.end(status);
365     }
366 
367     // Consume additional "misc" [XML production 27] appearing after the DocType
368     parseMisc(status);
369 
370     // Get the root element
371     if (mXMLElemEmpty.lookingAt(fPos, status)) {
372         // Root is an empty element (no nested elements or content)
373         root = createElement(mXMLElemEmpty, status);
374         fPos = mXMLElemEmpty.end(status);
375     } else {
376         if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
377             error("Root Element expected", status);
378             goto errorExit;
379         }
380         root = createElement(mXMLElemStart, status);
381         UXMLElement  *el = root;
382 
383         //
384         // This is the loop that consumes the root element of the document,
385         //      including all nested content.   Nested elements are handled by
386         //      explicit pushes/pops of the element stack; there is no recursion
387         //      in the control flow of this code.
388         //      "el" always refers to the current element, the one to which content
389         //      is being added.  It is above the top of the element stack.
390         for (;;) {
391             // Nested Element Start
392             if (mXMLElemStart.lookingAt(fPos, status)) {
393                 UXMLElement *t = createElement(mXMLElemStart, status);
394                 el->fChildren.addElement(t, status);
395                 t->fParent = el;
396                 fElementStack.push(el, status);
397                 el = t;
398                 continue;
399             }
400 
401             // Text Content.  String is concatenated onto the current node's content,
402             //                but only if it contains something other than spaces.
403             UnicodeString s = scanContent(status);
404             if (s.length() > 0) {
405                 mXMLSP.reset(s);
406                 if (mXMLSP.matches(status) == FALSE) {
407                     // This chunk of text contains something other than just
408                     //  white space. Make a child node for it.
409                     replaceCharRefs(s, status);
410                     el->fChildren.addElement(s.clone(), status);
411                 }
412                 mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
413                 continue;
414             }
415 
416             // Comments.  Discard.
417             if (mXMLComment.lookingAt(fPos, status)) {
418                 fPos = mXMLComment.end(status);
419                 continue;
420             }
421 
422             // PIs.  Discard.
423             if (mXMLPI.lookingAt(fPos, status)) {
424                 fPos = mXMLPI.end(status);
425                 continue;
426             }
427 
428             // Element End
429             if (mXMLElemEnd.lookingAt(fPos, status)) {
430                 fPos = mXMLElemEnd.end(0, status);
431                 const UnicodeString name = mXMLElemEnd.group(1, status);
432                 if (name != *el->fName) {
433                     error("Element start / end tag mismatch", status);
434                     goto errorExit;
435                 }
436                 if (fElementStack.empty()) {
437                     // Close of the root element.  We're done with the doc.
438                     el = NULL;
439                     break;
440                 }
441                 el = (UXMLElement *)fElementStack.pop();
442                 continue;
443             }
444 
445             // Empty Element.  Stored as a child of the current element, but not stacked.
446             if (mXMLElemEmpty.lookingAt(fPos, status)) {
447                 UXMLElement *t = createElement(mXMLElemEmpty, status);
448                 el->fChildren.addElement(t, status);
449                 continue;
450             }
451 
452             // Hit something within the document that doesn't match anything.
453             //   It's an error.
454             error("Unrecognized markup", status);
455             break;
456         }
457 
458         if (el != NULL || !fElementStack.empty()) {
459             // We bailed out early, for some reason.
460             error("Root element not closed.", status);
461             goto errorExit;
462         }
463     }
464 
465     // Root Element parse is complete.
466     // Consume the annoying xml "Misc" that can appear at the end of the doc.
467     parseMisc(status);
468 
469     // We should have reached the end of the input
470     if (fPos != src.length()) {
471         error("Extra content at the end of the document", status);
472         goto errorExit;
473     }
474 
475     // Success!
476     return root;
477 
478 errorExit:
479     delete root;
480     return NULL;
481 }
482 
483 //
484 //  createElement
485 //      We've just matched an element start tag.  Create and fill in a UXMLElement object
486 //      for it.
487 //
488 UXMLElement *
createElement(RegexMatcher & mEl,UErrorCode & status)489 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
490     // First capture group is the element's name.
491     UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
492 
493     // Scan for attributes.
494     int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
495 
496     while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
497         UnicodeString attName  = mAttrValue.group(1, status);
498         UnicodeString attValue = mAttrValue.group(2, status);
499 
500         // Trim the quotes from the att value.  These are left over from the original regex
501         //   that parsed the attribue, which couldn't conveniently strip them.
502         attValue.remove(0,1);                    // one char from the beginning
503         attValue.truncate(attValue.length()-1);  // and one from the end.
504 
505         // XML Attribue value normalization.
506         // This is one of the really screwy parts of the XML spec.
507         // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
508         // Note that non-validating parsers must treat all entities as type CDATA
509         //   which simplifies things some.
510 
511         // Att normalization step 1:  normalize any newlines in the attribute value
512         mNewLineNormalizer.reset(attValue);
513         attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
514 
515         // Next change all xml white space chars to plain \u0020 spaces.
516         mAttrNormalizer.reset(attValue);
517         UnicodeString oneSpace((UChar)0x0020);
518         attValue = mAttrNormalizer.replaceAll(oneSpace, status);
519 
520         // Replace character entities.
521         replaceCharRefs(attValue, status);
522 
523         // Save the attribute name and value in our document structure.
524         el->fAttNames.addElement((void *)intern(attName, status), status);
525         el->fAttValues.addElement(attValue.clone(), status);
526         pos = mAttrValue.end(2, status);
527     }
528     fPos = mEl.end(0, status);
529     return el;
530 }
531 
532 //
533 //  parseMisc
534 //     Consume XML "Misc" [production #27]
535 //        which is any combination of space, PI and comments
536 //      Need to watch end-of-input because xml MISC stuff is allowed after
537 //        the document element, so we WILL scan off the end in this function
538 //
539 void
parseMisc(UErrorCode & status)540 UXMLParser::parseMisc(UErrorCode &status)  {
541     for (;;) {
542         if (fPos >= mXMLPI.input().length()) {
543             break;
544         }
545         if (mXMLPI.lookingAt(fPos, status)) {
546             fPos = mXMLPI.end(status);
547             continue;
548         }
549         if (mXMLSP.lookingAt(fPos, status)) {
550             fPos = mXMLSP.end(status);
551             continue;
552         }
553         if (mXMLComment.lookingAt(fPos, status)) {
554             fPos = mXMLComment.end(status);
555             continue;
556         }
557         break;
558     }
559 }
560 
561 //
562 //  Scan for document content.
563 //
564 UnicodeString
scanContent(UErrorCode & status)565 UXMLParser::scanContent(UErrorCode &status) {
566     UnicodeString  result;
567     if (mXMLCharData.lookingAt(fPos, status)) {
568         result = mXMLCharData.group((int32_t)0, status);
569         // Normalize the new-lines.  (Before char ref substitution)
570         mNewLineNormalizer.reset(result);
571         result = mNewLineNormalizer.replaceAll(fOneLF, status);
572 
573         // TODO:  handle CDATA
574         fPos = mXMLCharData.end(0, status);
575     }
576 
577     return result;
578 }
579 
580 //
581 //   replaceCharRefs
582 //
583 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
584 //       with the corresponding actual character.
585 //
586 void
replaceCharRefs(UnicodeString & s,UErrorCode & status)587 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
588     UnicodeString result;
589     UnicodeString replacement;
590     int     i;
591 
592     mAmps.reset(s);
593     // See the initialization for the regex matcher mAmps.
594     //    Which entity we've matched is determined by which capture group has content,
595     //      which is flaged by start() of that group not being -1.
596     while (mAmps.find()) {
597         if (mAmps.start(1, status) != -1) {
598             replacement.setTo((UChar)x_AMP);
599         } else if (mAmps.start(2, status) != -1) {
600             replacement.setTo((UChar)x_LT);
601         } else if (mAmps.start(3, status) != -1) {
602             replacement.setTo((UChar)x_GT);
603         } else if (mAmps.start(4, status) != -1) {
604             replacement.setTo((UChar)x_APOS);
605         } else if (mAmps.start(5, status) != -1) {
606             replacement.setTo((UChar)x_QUOT);
607         } else if (mAmps.start(6, status) != -1) {
608             UnicodeString hexString = mAmps.group(6, status);
609             UChar32 val = 0;
610             for (i=0; i<hexString.length(); i++) {
611                 val = (val << 4) + u_digit(hexString.charAt(i), 16);
612             }
613             // TODO:  some verification that the character is valid
614             replacement.setTo(val);
615         } else if (mAmps.start(7, status) != -1) {
616             UnicodeString decimalString = mAmps.group(7, status);
617             UChar32 val = 0;
618             for (i=0; i<decimalString.length(); i++) {
619                 val = val*10 + u_digit(decimalString.charAt(i), 10);
620             }
621             // TODO:  some verification that the character is valid
622             replacement.setTo(val);
623         } else {
624             // An unrecognized &entity;  Leave it alone.
625             //  TODO:  check that it really looks like an entity, and is not some
626             //         random & in the text.
627             replacement = mAmps.group((int32_t)0, status);
628         }
629         mAmps.appendReplacement(result, replacement, status);
630     }
631     mAmps.appendTail(result);
632     s = result;
633 }
634 
635 void
error(const char * message,UErrorCode & status)636 UXMLParser::error(const char *message, UErrorCode &status) {
637     // TODO:  something better here...
638     const UnicodeString &src=mXMLDecl.input();
639     int  line = 0;
640     int  ci = 0;
641     while (ci < fPos && ci>=0) {
642         ci = src.indexOf((UChar)0x0a, ci+1);
643         line++;
644     }
645     fprintf(stderr, "Error: %s at line %d\n", message, line);
646     if (U_SUCCESS(status)) {
647         status = U_PARSE_ERROR;
648     }
649 }
650 
651 // intern strings like in Java
652 
653 const UnicodeString *
intern(const UnicodeString & s,UErrorCode & errorCode)654 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
655     const UHashElement *he=fNames.find(s);
656     if(he!=NULL) {
657         // already a known name, return its hashed key pointer
658         return (const UnicodeString *)he->key.pointer;
659     } else {
660         // add this new name and return its hashed key pointer
661         fNames.puti(s, 0, errorCode);
662         he=fNames.find(s);
663         return (const UnicodeString *)he->key.pointer;
664     }
665 }
666 
667 const UnicodeString *
findName(const UnicodeString & s) const668 UXMLParser::findName(const UnicodeString &s) const {
669     const UHashElement *he=fNames.find(s);
670     if(he!=NULL) {
671         // a known name, return its hashed key pointer
672         return (const UnicodeString *)he->key.pointer;
673     } else {
674         // unknown name
675         return NULL;
676     }
677 }
678 
679 // UXMLElement ------------------------------------------------------------- ***
680 
UXMLElement(const UXMLParser * parser,const UnicodeString * name,UErrorCode & errorCode)681 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
682    fParser(parser),
683    fName(name),
684    fAttNames(errorCode),
685    fAttValues(errorCode),
686    fChildren(errorCode),
687    fParent(NULL)
688 {
689 }
690 
~UXMLElement()691 UXMLElement::~UXMLElement() {
692     int   i;
693     // attribute names are owned by the UXMLParser, don't delete them here
694     for (i=fAttValues.size()-1; i>=0; i--) {
695         delete (UObject *)fAttValues.elementAt(i);
696     }
697     for (i=fChildren.size()-1; i>=0; i--) {
698         delete (UObject *)fChildren.elementAt(i);
699     }
700 }
701 
702 const UnicodeString &
getTagName() const703 UXMLElement::getTagName() const {
704     return *fName;
705 }
706 
707 UnicodeString
getText(UBool recurse) const708 UXMLElement::getText(UBool recurse) const {
709     UnicodeString text;
710     appendText(text, recurse);
711     return text;
712 }
713 
714 void
appendText(UnicodeString & text,UBool recurse) const715 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
716     const UObject *node;
717     int32_t i, count=fChildren.size();
718     for(i=0; i<count; ++i) {
719         node=(const UObject *)fChildren.elementAt(i);
720         const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
721         if(s!=NULL) {
722             text.append(*s);
723         } else if(recurse) /* must be a UXMLElement */ {
724             ((const UXMLElement *)node)->appendText(text, recurse);
725         }
726     }
727 }
728 
729 int32_t
countAttributes() const730 UXMLElement::countAttributes() const {
731     return fAttNames.size();
732 }
733 
734 const UnicodeString *
getAttribute(int32_t i,UnicodeString & name,UnicodeString & value) const735 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
736     if(0<=i && i<fAttNames.size()) {
737         name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
738         value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
739         return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
740     } else {
741         return NULL;
742     }
743 }
744 
745 const UnicodeString *
getAttribute(const UnicodeString & name) const746 UXMLElement::getAttribute(const UnicodeString &name) const {
747     // search for the attribute name by comparing the interned pointer,
748     // not the string contents
749     const UnicodeString *p=fParser->findName(name);
750     if(p==NULL) {
751         return NULL; // no such attribute seen by the parser at all
752     }
753 
754     int32_t i, count=fAttNames.size();
755     for(i=0; i<count; ++i) {
756         if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
757             return (const UnicodeString *)fAttValues.elementAt(i);
758         }
759     }
760     return NULL;
761 }
762 
763 int32_t
countChildren() const764 UXMLElement::countChildren() const {
765     return fChildren.size();
766 }
767 
768 const UObject *
getChild(int32_t i,UXMLNodeType & type) const769 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
770     if(0<=i && i<fChildren.size()) {
771         const UObject *node=(const UObject *)fChildren.elementAt(i);
772         if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
773             type=UXML_NODE_TYPE_ELEMENT;
774         } else {
775             type=UXML_NODE_TYPE_STRING;
776         }
777         return node;
778     } else {
779         return NULL;
780     }
781 }
782 
783 const UXMLElement *
nextChildElement(int32_t & i) const784 UXMLElement::nextChildElement(int32_t &i) const {
785     if(i<0) {
786         return NULL;
787     }
788 
789     const UObject *node;
790     int32_t count=fChildren.size();
791     while(i<count) {
792         node=(const UObject *)fChildren.elementAt(i++);
793         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
794         if(elem!=NULL) {
795             return elem;
796         }
797     }
798     return NULL;
799 }
800 
801 const UXMLElement *
getChildElement(const UnicodeString & name) const802 UXMLElement::getChildElement(const UnicodeString &name) const {
803     // search for the element name by comparing the interned pointer,
804     // not the string contents
805     const UnicodeString *p=fParser->findName(name);
806     if(p==NULL) {
807         return NULL; // no such element seen by the parser at all
808     }
809 
810     const UObject *node;
811     int32_t i, count=fChildren.size();
812     for(i=0; i<count; ++i) {
813         node=(const UObject *)fChildren.elementAt(i);
814         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
815         if(elem!=NULL) {
816             if(p==elem->fName) {
817                 return elem;
818             }
819         }
820     }
821     return NULL;
822 }
823 
824 U_NAMESPACE_END
825 
826 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
827 
828