• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2004-2006, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  xmlparser.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2004jul21
14 *   created by: Andy Heninger
15 */
16 
17 #include <stdio.h>
18 #include "unicode/uchar.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/regex.h"
21 #include "filestrm.h"
22 #include "xmlparser.h"
23 
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
25 
// Code points of the ASCII characters that the parser refers to by name.
enum {
    x_QUOT = 0x22,   // "  quotation mark
    x_AMP  = 0x26,   // &  ampersand
    x_APOS = 0x27,   // '  apostrophe
    x_LT   = 0x3c,   // <  less-than sign
    x_GT   = 0x3e,   // >  greater-than sign
    x_l    = 0x6c    // l  small letter L; parseFile() searches for it to step past "<?xml"
};
35 
// Regular-expression fragments shared by the matchers built in the UXMLParser
// constructor.  Each one is a string literal so that it can be spliced into a
// larger pattern by adjacent-literal concatenation.

// XML white space: space, tab, carriage return, line feed (one character;
// callers append their own quantifier).
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML production [4], NameStartChar: one character that may begin a name.
#define XML_NAMESTARTCHAR \
    "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

// XML production [5], NameChar: one character that may continue a name.
// This is the NameStartChar set plus '-', '.', digits and a few combining ranges.
#define XML_NAMECHAR \
    "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

// XML production [6], Name: a NameStartChar followed by any number of NameChars.
#define XML_NAME \
    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
49 
50 U_NAMESPACE_BEGIN
51 
// ICU class-ID boilerplate for the two classes implemented in this file.
// UXMLElement compares getDynamicClassID() against getStaticClassID() to tell
// element children apart from text (UnicodeString) children.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
54 
55 //
56 //   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
57 //                             used for parsing.
58 //
59 UXMLParser::UXMLParser(UErrorCode &status) :
60       //  XML Declaration.  XML Production #23.
61       //      example:  "<?xml version=1.0 encoding="utf-16" ?>
62       //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
63       //            allow for a possible leading BOM.
64       mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>"), 0, status),
65 
66       //  XML Comment   production #15
67       //     example:  "<!-- whatever -->
68       //       note, does not detect an illegal "--" within comments
69       mXMLComment(UnicodeString("(?s)<!--.+?-->"), 0, status),
70 
71       //  XML Spaces
72       //      production [3]
73       mXMLSP(UnicodeString(XML_SPACES "+"), 0, status),
74 
75       //  XML Doctype decl  production #28
76       //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
77       //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
78       //           Some internal dtd subsets could confuse this simple-minded
79       //           attempt at skipping over them.
80       mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>"), 0, status),
81 
82       //  XML PI     production #16
83       //     example   "<?target stuff?>
84       mXMLPI(UnicodeString("(?s)<\\?.+?\\?>"), 0, status),
85 
86       //  XML Element Start   Productions #40, #41
87       //          example   <foo att1='abc'  att2="d e f" >
88       //      capture #1:  the tag name
89       //
90       mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
91           "(?:"
92                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
93                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
94           ")*"                                                             //   * for zero or more attributes.
95           XML_SPACES "*?>"), 0, status),                               // match " >"
96 
97       //  XML Element End     production #42
98       //     example   </foo>
99       mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>"), 0, status),
100 
101       // XML Element Empty    production #44
102       //     example   <foo att1="abc"   att2="d e f" />
103       mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
104           "(?:"
105                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
106                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
107           ")*"                                                             //   * for zero or more attributes.
108           XML_SPACES "*?/>"), 0, status),                              // match " />"
109 
110 
111       // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
112       mXMLCharData(UnicodeString("(?s)[^<]*"), 0, status),
113 
114       // Attribute name = "value".  XML Productions 10, 40/41
115       //  Capture group 1 is name,
116       //                2 is the attribute value, including the quotes.
117       //
118       //   Note that attributes are scanned twice.  The first time is with
119       //        the regex for an entire element start.  There, the attributes
120       //        are checked syntactically, but not separted out one by one.
121       //        Here, we match a single attribute, and make its name and
122       //        attribute value available to the parser code.
123       mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
124          "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"), 0, status),
125 
126 
127       mAttrNormalizer(UnicodeString(XML_SPACES), 0, status),
128 
129       // Match any of the new-line sequences in content.
130       //   All are changed to \u000a.
131       mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028"), 0, status),
132 
133       // & char references
134       //   We will figure out what we've got based on which capture group has content.
135       //   The last one is a catchall for unrecognized entity references..
136       //             1     2     3      4      5           6                    7          8
137       mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
138                 0, status),
139 
140       fNames(status),
141       fElementStack(status),
142       fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
143       {
144       }
145 
146 UXMLParser *
createParser(UErrorCode & errorCode)147 UXMLParser::createParser(UErrorCode &errorCode) {
148     if (U_FAILURE(errorCode)) {
149         return NULL;
150     } else {
151         return new UXMLParser(errorCode);
152     }
153 }
154 
~UXMLParser()155 UXMLParser::~UXMLParser() {}
156 
157 UXMLElement *
parseFile(const char * filename,UErrorCode & errorCode)158 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
159     char bytes[4096], charsetBuffer[100];
160     FileStream *f;
161     const char *charset, *pb;
162     UnicodeString src;
163     UConverter *cnv;
164     UChar *buffer, *pu;
165     int32_t fileLength, bytesLength, length, capacity;
166     UBool flush;
167 
168     if(U_FAILURE(errorCode)) {
169         return NULL;
170     }
171 
172     f=T_FileStream_open(filename, "rb");
173     if(f==NULL) {
174         errorCode=U_FILE_ACCESS_ERROR;
175         return NULL;
176     }
177 
178     bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
179     if(bytesLength<(int32_t)sizeof(bytes)) {
180         // we have already read the entire file
181         fileLength=bytesLength;
182     } else {
183         // get the file length
184         fileLength=T_FileStream_size(f);
185     }
186 
187     /*
188      * get the charset:
189      * 1. Unicode signature
190      * 2. treat as ISO-8859-1 and read XML encoding="charser"
191      * 3. default to UTF-8
192      */
193     charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
194     if(U_SUCCESS(errorCode) && charset!=NULL) {
195         // open converter according to Unicode signature
196         cnv=ucnv_open(charset, &errorCode);
197     } else {
198         // read as Latin-1 and parse the XML declaration and encoding
199         cnv=ucnv_open("ISO-8859-1", &errorCode);
200         if(U_FAILURE(errorCode)) {
201             // unexpected error opening Latin-1 converter
202             goto exit;
203         }
204 
205         buffer=src.getBuffer(bytesLength);
206         if(buffer==NULL) {
207             // unexpected failure to reserve some string capacity
208             errorCode=U_MEMORY_ALLOCATION_ERROR;
209             goto exit;
210         }
211         pb=bytes;
212         pu=buffer;
213         ucnv_toUnicode(
214             cnv,
215             &pu, buffer+src.getCapacity(),
216             &pb, bytes+bytesLength,
217             NULL, TRUE, &errorCode);
218         src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
219         ucnv_close(cnv);
220         cnv=NULL;
221         if(U_FAILURE(errorCode)) {
222             // unexpected error in conversion from Latin-1
223             src.remove();
224             goto exit;
225         }
226 
227         // parse XML declaration
228         if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
229             int32_t declEnd=mXMLDecl.end(errorCode);
230             // go beyond <?xml
231             int32_t pos=src.indexOf((UChar)x_l)+1;
232 
233             mAttrValue.reset(src);
234             while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
235                 UnicodeString attName  = mAttrValue.group(1, errorCode);
236                 UnicodeString attValue = mAttrValue.group(2, errorCode);
237 
238                 // Trim the quotes from the att value.  These are left over from the original regex
239                 //   that parsed the attribue, which couldn't conveniently strip them.
240                 attValue.remove(0,1);                    // one char from the beginning
241                 attValue.truncate(attValue.length()-1);  // and one from the end.
242 
243                 if(attName==UNICODE_STRING("encoding", 8)) {
244                     length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
245                     charset=charsetBuffer;
246                     break;
247                 }
248                 pos = mAttrValue.end(2, errorCode);
249             }
250 
251             if(charset==NULL) {
252                 // default to UTF-8
253                 charset="UTF-8";
254             }
255             cnv=ucnv_open(charset, &errorCode);
256         }
257     }
258 
259     if(U_FAILURE(errorCode)) {
260         // unable to open the converter
261         goto exit;
262     }
263 
264     // convert the file contents
265     capacity=fileLength;        // estimated capacity
266     src.getBuffer(capacity);
267     src.releaseBuffer(0);       // zero length
268     flush=FALSE;
269     for(;;) {
270         // convert contents of bytes[bytesLength]
271         pb=bytes;
272         for(;;) {
273             length=src.length();
274             buffer=src.getBuffer(capacity);
275             if(buffer==NULL) {
276                 // unexpected failure to reserve some string capacity
277                 errorCode=U_MEMORY_ALLOCATION_ERROR;
278                 goto exit;
279             }
280 
281             pu=buffer+length;
282             ucnv_toUnicode(
283                 cnv, &pu, buffer+src.getCapacity(),
284                 &pb, bytes+bytesLength,
285                 NULL, FALSE, &errorCode);
286             src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
287             if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
288                 errorCode=U_ZERO_ERROR;
289                 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
290             } else {
291                 break;
292             }
293         }
294 
295         if(U_FAILURE(errorCode)) {
296             break; // conversion error
297         }
298 
299         if(flush) {
300             break; // completely converted the file
301         }
302 
303         // read next block
304         bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
305         if(bytesLength==0) {
306             // reached end of file, convert once more to flush the converter
307             flush=TRUE;
308         }
309     };
310 
311 exit:
312     ucnv_close(cnv);
313     T_FileStream_close(f);
314 
315     if(U_SUCCESS(errorCode)) {
316         return parse(src, errorCode);
317     } else {
318         return NULL;
319     }
320 }
321 
322 UXMLElement *
parse(const UnicodeString & src,UErrorCode & status)323 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
324     if(U_FAILURE(status)) {
325         return NULL;
326     }
327 
328     UXMLElement   *root = NULL;
329     fPos = 0; // TODO use just a local pos variable and pass it into functions
330               // where necessary?
331 
332     // set all matchers to work on the input string
333     mXMLDecl.reset(src);
334     mXMLComment.reset(src);
335     mXMLSP.reset(src);
336     mXMLDoctype.reset(src);
337     mXMLPI.reset(src);
338     mXMLElemStart.reset(src);
339     mXMLElemEnd.reset(src);
340     mXMLElemEmpty.reset(src);
341     mXMLCharData.reset(src);
342     mAttrValue.reset(src);
343     mAttrNormalizer.reset(src);
344     mNewLineNormalizer.reset(src);
345     mAmps.reset(src);
346 
347     // Consume the XML Declaration, if present.
348     if (mXMLDecl.lookingAt(fPos, status)) {
349         fPos = mXMLDecl.end(status);
350     }
351 
352     // Consume "misc" [XML production 27] appearing before DocType
353     parseMisc(status);
354 
355     // Consume a DocType declaration, if present.
356     if (mXMLDoctype.lookingAt(fPos, status)) {
357         fPos = mXMLDoctype.end(status);
358     }
359 
360     // Consume additional "misc" [XML production 27] appearing after the DocType
361     parseMisc(status);
362 
363     // Get the root element
364     if (mXMLElemEmpty.lookingAt(fPos, status)) {
365         // Root is an empty element (no nested elements or content)
366         root = createElement(mXMLElemEmpty, status);
367         fPos = mXMLElemEmpty.end(status);
368     } else {
369         if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
370             error("Root Element expected", status);
371             goto errorExit;
372         }
373         root = createElement(mXMLElemStart, status);
374         UXMLElement  *el = root;
375 
376         //
377         // This is the loop that consumes the root element of the document,
378         //      including all nested content.   Nested elements are handled by
379         //      explicit pushes/pops of the element stack; there is no recursion
380         //      in the control flow of this code.
381         //      "el" always refers to the current element, the one to which content
382         //      is being added.  It is above the top of the element stack.
383         for (;;) {
384             // Nested Element Start
385             if (mXMLElemStart.lookingAt(fPos, status)) {
386                 UXMLElement *t = createElement(mXMLElemStart, status);
387                 el->fChildren.addElement(t, status);
388                 t->fParent = el;
389                 fElementStack.push(el, status);
390                 el = t;
391                 continue;
392             }
393 
394             // Text Content.  String is concatenated onto the current node's content,
395             //                but only if it contains something other than spaces.
396             UnicodeString s = scanContent(status);
397             if (s.length() > 0) {
398                 mXMLSP.reset(s);
399                 if (mXMLSP.matches(status) == FALSE) {
400                     // This chunk of text contains something other than just
401                     //  white space. Make a child node for it.
402                     replaceCharRefs(s, status);
403                     el->fChildren.addElement(s.clone(), status);
404                 }
405                 mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
406                 continue;
407             }
408 
409             // Comments.  Discard.
410             if (mXMLComment.lookingAt(fPos, status)) {
411                 fPos = mXMLComment.end(status);
412                 continue;
413             }
414 
415             // PIs.  Discard.
416             if (mXMLPI.lookingAt(fPos, status)) {
417                 fPos = mXMLPI.end(status);
418                 continue;
419             }
420 
421             // Element End
422             if (mXMLElemEnd.lookingAt(fPos, status)) {
423                 fPos = mXMLElemEnd.end(0, status);
424                 const UnicodeString name = mXMLElemEnd.group(1, status);
425                 if (name != *el->fName) {
426                     error("Element start / end tag mismatch", status);
427                     goto errorExit;
428                 }
429                 if (fElementStack.empty()) {
430                     // Close of the root element.  We're done with the doc.
431                     el = NULL;
432                     break;
433                 }
434                 el = (UXMLElement *)fElementStack.pop();
435                 continue;
436             }
437 
438             // Empty Element.  Stored as a child of the current element, but not stacked.
439             if (mXMLElemEmpty.lookingAt(fPos, status)) {
440                 UXMLElement *t = createElement(mXMLElemEmpty, status);
441                 el->fChildren.addElement(t, status);
442                 continue;
443             }
444 
445             // Hit something within the document that doesn't match anything.
446             //   It's an error.
447             error("Unrecognized markup", status);
448             break;
449         }
450 
451         if (el != NULL || !fElementStack.empty()) {
452             // We bailed out early, for some reason.
453             error("Root element not closed.", status);
454             goto errorExit;
455         }
456     }
457 
458     // Root Element parse is complete.
459     // Consume the annoying xml "Misc" that can appear at the end of the doc.
460     parseMisc(status);
461 
462     // We should have reached the end of the input
463     if (fPos != src.length()) {
464         error("Extra content at the end of the document", status);
465         goto errorExit;
466     }
467 
468     // Success!
469     return root;
470 
471 errorExit:
472     delete root;
473     return NULL;
474 }
475 
476 //
477 //  createElement
478 //      We've just matched an element start tag.  Create and fill in a UXMLElement object
479 //      for it.
480 //
481 UXMLElement *
createElement(RegexMatcher & mEl,UErrorCode & status)482 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
483     // First capture group is the element's name.
484     UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
485 
486     // Scan for attributes.
487     int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
488 
489     while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
490         UnicodeString attName  = mAttrValue.group(1, status);
491         UnicodeString attValue = mAttrValue.group(2, status);
492 
493         // Trim the quotes from the att value.  These are left over from the original regex
494         //   that parsed the attribue, which couldn't conveniently strip them.
495         attValue.remove(0,1);                    // one char from the beginning
496         attValue.truncate(attValue.length()-1);  // and one from the end.
497 
498         // XML Attribue value normalization.
499         // This is one of the really screwy parts of the XML spec.
500         // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
501         // Note that non-validating parsers must treat all entities as type CDATA
502         //   which simplifies things some.
503 
504         // Att normalization step 1:  normalize any newlines in the attribute value
505         mNewLineNormalizer.reset(attValue);
506         attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
507 
508         // Next change all xml white space chars to plain \u0020 spaces.
509         mAttrNormalizer.reset(attValue);
510         UnicodeString oneSpace((UChar)0x0020);
511         attValue = mAttrNormalizer.replaceAll(oneSpace, status);
512 
513         // Replace character entities.
514         replaceCharRefs(attValue, status);
515 
516         // Save the attribute name and value in our document structure.
517         el->fAttNames.addElement((void *)intern(attName, status), status);
518         el->fAttValues.addElement(attValue.clone(), status);
519         pos = mAttrValue.end(2, status);
520     }
521     fPos = mEl.end(0, status);
522     return el;
523 }
524 
525 //
526 //  parseMisc
527 //     Consume XML "Misc" [production #27]
528 //        which is any combination of space, PI and comments
529 //      Need to watch end-of-input because xml MISC stuff is allowed after
530 //        the document element, so we WILL scan off the end in this function
531 //
532 void
parseMisc(UErrorCode & status)533 UXMLParser::parseMisc(UErrorCode &status)  {
534     for (;;) {
535         if (fPos >= mXMLPI.input().length()) {
536             break;
537         }
538         if (mXMLPI.lookingAt(fPos, status)) {
539             fPos = mXMLPI.end(status);
540             continue;
541         }
542         if (mXMLSP.lookingAt(fPos, status)) {
543             fPos = mXMLSP.end(status);
544             continue;
545         }
546         if (mXMLComment.lookingAt(fPos, status)) {
547             fPos = mXMLComment.end(status);
548             continue;
549         }
550         break;
551     }
552 }
553 
554 //
555 //  Scan for document content.
556 //
557 UnicodeString
scanContent(UErrorCode & status)558 UXMLParser::scanContent(UErrorCode &status) {
559     UnicodeString  result;
560     if (mXMLCharData.lookingAt(fPos, status)) {
561         result = mXMLCharData.group(0, status);
562         // Normalize the new-lines.  (Before char ref substitution)
563         mNewLineNormalizer.reset(result);
564         result = mNewLineNormalizer.replaceAll(fOneLF, status);
565 
566         // TODO:  handle CDATA
567         fPos = mXMLCharData.end(0, status);
568     }
569 
570     return result;
571 }
572 
573 //
574 //   replaceCharRefs
575 //
576 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
577 //       with the corresponding actual character.
578 //
579 void
replaceCharRefs(UnicodeString & s,UErrorCode & status)580 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
581     UnicodeString result;
582     UnicodeString replacement;
583     int     i;
584 
585     mAmps.reset(s);
586     // See the initialization for the regex matcher mAmps.
587     //    Which entity we've matched is determined by which capture group has content,
588     //      which is flaged by start() of that group not being -1.
589     while (mAmps.find()) {
590         if (mAmps.start(1, status) != -1) {
591             replacement.setTo((UChar)x_AMP);
592         } else if (mAmps.start(2, status) != -1) {
593             replacement.setTo((UChar)x_LT);
594         } else if (mAmps.start(3, status) != -1) {
595             replacement.setTo((UChar)x_GT);
596         } else if (mAmps.start(4, status) != -1) {
597             replacement.setTo((UChar)x_APOS);
598         } else if (mAmps.start(5, status) != -1) {
599             replacement.setTo((UChar)x_QUOT);
600         } else if (mAmps.start(6, status) != -1) {
601             UnicodeString hexString = mAmps.group(6, status);
602             UChar32 val = 0;
603             for (i=0; i<hexString.length(); i++) {
604                 val = (val << 4) + u_digit(hexString.charAt(i), 16);
605             }
606             // TODO:  some verification that the character is valid
607             replacement.setTo(val);
608         } else if (mAmps.start(7, status) != -1) {
609             UnicodeString decimalString = mAmps.group(7, status);
610             UChar32 val = 0;
611             for (i=0; i<decimalString.length(); i++) {
612                 val = val*10 + u_digit(decimalString.charAt(i), 10);
613             }
614             // TODO:  some verification that the character is valid
615             replacement.setTo(val);
616         } else {
617             // An unrecognized &entity;  Leave it alone.
618             //  TODO:  check that it really looks like an entity, and is not some
619             //         random & in the text.
620             replacement = mAmps.group(0, status);
621         }
622         mAmps.appendReplacement(result, replacement, status);
623     }
624     mAmps.appendTail(result);
625     s = result;
626 }
627 
628 void
error(const char * message,UErrorCode & status)629 UXMLParser::error(const char *message, UErrorCode &status) {
630     // TODO:  something better here...
631     const UnicodeString &src=mXMLDecl.input();
632     int  line = 0;
633     int  ci = 0;
634     while (ci < fPos && ci>=0) {
635         ci = src.indexOf((UChar)0x0a, ci+1);
636         line++;
637     }
638     fprintf(stderr, "Error: %s at line %d\n", message, line);
639     if (U_SUCCESS(status)) {
640         status = U_PARSE_ERROR;
641     }
642 }
643 
644 // intern strings like in Java
645 
646 const UnicodeString *
intern(const UnicodeString & s,UErrorCode & errorCode)647 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
648     const UHashElement *he=fNames.find(s);
649     if(he!=NULL) {
650         // already a known name, return its hashed key pointer
651         return (const UnicodeString *)he->key.pointer;
652     } else {
653         // add this new name and return its hashed key pointer
654         fNames.puti(s, 0, errorCode);
655         he=fNames.find(s);
656         return (const UnicodeString *)he->key.pointer;
657     }
658 }
659 
660 const UnicodeString *
findName(const UnicodeString & s) const661 UXMLParser::findName(const UnicodeString &s) const {
662     const UHashElement *he=fNames.find(s);
663     if(he!=NULL) {
664         // a known name, return its hashed key pointer
665         return (const UnicodeString *)he->key.pointer;
666     } else {
667         // unknown name
668         return NULL;
669     }
670 }
671 
672 // UXMLElement ------------------------------------------------------------- ***
673 
UXMLElement(const UXMLParser * parser,const UnicodeString * name,UErrorCode & errorCode)674 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
675    fParser(parser),
676    fName(name),
677    fAttNames(errorCode),
678    fAttValues(errorCode),
679    fChildren(errorCode),
680    fParent(NULL)
681 {
682 }
683 
~UXMLElement()684 UXMLElement::~UXMLElement() {
685     int   i;
686     // attribute names are owned by the UXMLParser, don't delete them here
687     for (i=fAttValues.size()-1; i>=0; i--) {
688         delete (UObject *)fAttValues.elementAt(i);
689     }
690     for (i=fChildren.size()-1; i>=0; i--) {
691         delete (UObject *)fChildren.elementAt(i);
692     }
693 }
694 
695 const UnicodeString &
getTagName() const696 UXMLElement::getTagName() const {
697     return *fName;
698 }
699 
700 UnicodeString
getText(UBool recurse) const701 UXMLElement::getText(UBool recurse) const {
702     UnicodeString text;
703     appendText(text, recurse);
704     return text;
705 }
706 
707 void
appendText(UnicodeString & text,UBool recurse) const708 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
709     const UObject *node;
710     int32_t i, count=fChildren.size();
711     for(i=0; i<count; ++i) {
712         node=(const UObject *)fChildren.elementAt(i);
713         if(node->getDynamicClassID()==UnicodeString::getStaticClassID()) {
714             text.append(*(const UnicodeString *)node);
715         } else if(recurse) /* must be a UXMLElement */ {
716             ((const UXMLElement *)node)->appendText(text, recurse);
717         }
718     }
719 }
720 
721 int32_t
countAttributes() const722 UXMLElement::countAttributes() const {
723     return fAttNames.size();
724 }
725 
726 const UnicodeString *
getAttribute(int32_t i,UnicodeString & name,UnicodeString & value) const727 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
728     if(0<=i && i<fAttNames.size()) {
729         name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
730         value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
731         return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
732     } else {
733         return NULL;
734     }
735 }
736 
737 const UnicodeString *
getAttribute(const UnicodeString & name) const738 UXMLElement::getAttribute(const UnicodeString &name) const {
739     // search for the attribute name by comparing the interned pointer,
740     // not the string contents
741     const UnicodeString *p=fParser->findName(name);
742     if(p==NULL) {
743         return NULL; // no such attribute seen by the parser at all
744     }
745 
746     int32_t i, count=fAttNames.size();
747     for(i=0; i<count; ++i) {
748         if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
749             return (const UnicodeString *)fAttValues.elementAt(i);
750         }
751     }
752     return NULL;
753 }
754 
755 int32_t
countChildren() const756 UXMLElement::countChildren() const {
757     return fChildren.size();
758 }
759 
760 const UObject *
getChild(int32_t i,UXMLNodeType & type) const761 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
762     if(0<=i && i<fChildren.size()) {
763         const UObject *node=(const UObject *)fChildren.elementAt(i);
764         if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
765             type=UXML_NODE_TYPE_ELEMENT;
766         } else {
767             type=UXML_NODE_TYPE_STRING;
768         }
769         return node;
770     } else {
771         return NULL;
772     }
773 }
774 
775 const UXMLElement *
nextChildElement(int32_t & i) const776 UXMLElement::nextChildElement(int32_t &i) const {
777     if(i<0) {
778         return NULL;
779     }
780 
781     const UObject *node;
782     int32_t count=fChildren.size();
783     while(i<count) {
784         node=(const UObject *)fChildren.elementAt(i++);
785         // TODO: see if ICU can use C++ instanceof instead of its own poor man's RTTI
786         // if(node instanceof UXMLElement) {
787         if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
788             return (const UXMLElement *)node;
789         }
790     }
791     return NULL;
792 }
793 
794 const UXMLElement *
getChildElement(const UnicodeString & name) const795 UXMLElement::getChildElement(const UnicodeString &name) const {
796     // search for the element name by comparing the interned pointer,
797     // not the string contents
798     const UnicodeString *p=fParser->findName(name);
799     if(p==NULL) {
800         return NULL; // no such element seen by the parser at all
801     }
802 
803     const UObject *node;
804     int32_t i, count=fChildren.size();
805     for(i=0; i<count; ++i) {
806         node=(const UObject *)fChildren.elementAt(i);
807         if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
808             const UXMLElement *elem=(const UXMLElement *)node;
809             if(p==elem->fName) {
810                 return elem;
811             }
812         }
813     }
814     return NULL;
815 }
816 
817 U_NAMESPACE_END
818 
819 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
820 
821