1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2004-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: xmlparser.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004jul21
14 * created by: Andy Heninger
15 */
16
17 #include <stdio.h>
18 #include "unicode/uchar.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/regex.h"
21 #include "filestrm.h"
22 #include "xmlparser.h"
23
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
25
// character constants
// UTF-16 code unit values of the ASCII characters that this file
// handles explicitly.
enum {
    x_QUOT=0x22,    // '"'  quotation mark
    x_AMP=0x26,     // '&'  ampersand
    x_APOS=0x27,    // '\'' apostrophe
    x_LT=0x3c,      // '<'  less-than sign
    x_GT=0x3e,      // '>'  greater-than sign
    x_l=0x6c        // 'l'  small letter L; parseFile() searches for it to step past "<?xml"
};
35
// Regular-expression fragments, written as C string literals so that they can
// be pasted together with adjacent-literal concatenation.

// A set matching one XML white space character:
// space, tab (U+0009), carriage return (U+000D) or line feed (U+000A).
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
// A set matching one character that may start an XML name.
#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

// XML #5
// A set matching one character that may appear inside an XML name:
// every name start character plus '-', '.', digits, U+00B7 and two more ranges.
#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

// XML #6
// A complete XML name: one name start character followed by zero or more
// name characters. There is no capture group in this fragment.
#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
49
50 U_NAMESPACE_BEGIN
51
// Instantiate ICU's UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for the two
// classes implemented in this file.
// NOTE(review): presumably this supplies the class ID functions declared in
// xmlparser.h - confirm against uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
54
//
//   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
//                             used for parsing.
//
//   Each matcher member is constructed from a pattern string, flags 0 and
//   status. The constructor body itself is empty.
//
//   @param status  ICU error code, handed to every member initializer that
//                  takes one.
//
UXMLParser::UXMLParser(UErrorCode &status) :
      //  XML Declaration.  XML Production #23.
      //      example:  <?xml version="1.0" encoding="utf-16" ?>
      //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
      //            allow for a possible leading BOM.
      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),

      //  XML Comment   production #15
      //     example:  <!-- whatever -->
      //       note, does not detect an illegal "--" within comments
      //       NOTE(review): ".+?" needs at least one character, so this pattern
      //       does not match an empty comment "<!---->" on its own.
      mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),

      //  XML Spaces
      //      production [3]
      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),

      //  XML Doctype decl  production #28
      //     example   <!DOCTYPE foo SYSTEM "somewhere" >
      //       or      <!DOCTYPE foo [internal dtd]>
      //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
      //           Some internal dtd subsets could confuse this simple-minded
      //           attempt at skipping over them, specifically, occurrences
      //           of closing square brackets.  These could appear in comments,
      //           or in parameter entity declarations, for example.
      mXMLDoctype(UnicodeString(
           "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
           ), 0, status),

      //  XML PI     production #16
      //     example   <?target stuff?>
      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),

      //  XML Element Start   Productions #40, #41
      //      example   <foo att1='abc'  att2="d e f" >
      //      capture #1:  the tag name
      //
      mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:"
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"

      //  XML Element End     production #42
      //     example   </foo>
      //     capture #1:  the tag name
      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),

      // XML Element Empty    production #44
      //     example   <foo att1="abc"   att2="d e f" />
      //     capture #1:  the tag name
      //     Same pattern as for an element start, except for the closing "/>".
      mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:"
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"


      // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),

      // Attribute name = "value".  XML Productions 10, 40/41
      //  Capture group 1 is name,
      //                2 is the attribute value, including the quotes.
      //
      //   Note that attributes are scanned twice.  The first time is with
      //        the regex for an entire element start.  There, the attributes
      //        are checked syntactically, but not separated out one by one.
      //        Here, we match a single attribute, and make its name and
      //        attribute value available to the parser code.
      mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),


      // Matches one XML white space character; createElement() uses it to
      // turn each of them into a plain space in attribute values.
      mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),

      // Match any of the new-line sequences in content.
      //   All are changed to \u000a.
      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),

      // & char references
      //   We will figure out what we've got based on which capture group has content.
      //   The last one is a catchall for unrecognized entity references..
      //   Capture groups:  1 "amp;"   2 "lt;"   3 "gt;"   4 "apos;"   5 "quot;"
      //                    6 the hex digits of "#x...;"
      //                    7 the decimal digits of "#...;"
      //                    8 any one other character
      //   NOTE(review): unlike the other patterns, this string is built without
      //   the (-1, US_INV) arguments - confirm that this is intended.
      mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
                0, status),

      fNames(status),
      fElementStack(status),
      fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
      {
      }
150
151 UXMLParser *
createParser(UErrorCode & errorCode)152 UXMLParser::createParser(UErrorCode &errorCode) {
153 if (U_FAILURE(errorCode)) {
154 return NULL;
155 } else {
156 return new UXMLParser(errorCode);
157 }
158 }
159
~UXMLParser()160 UXMLParser::~UXMLParser() {}
161
162 UXMLElement *
parseFile(const char * filename,UErrorCode & errorCode)163 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
164 char bytes[4096], charsetBuffer[100];
165 FileStream *f;
166 const char *charset, *pb;
167 UnicodeString src;
168 UConverter *cnv;
169 UChar *buffer, *pu;
170 int32_t fileLength, bytesLength, length, capacity;
171 UBool flush;
172
173 if(U_FAILURE(errorCode)) {
174 return NULL;
175 }
176
177 f=T_FileStream_open(filename, "rb");
178 if(f==NULL) {
179 errorCode=U_FILE_ACCESS_ERROR;
180 return NULL;
181 }
182
183 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
184 if(bytesLength<(int32_t)sizeof(bytes)) {
185 // we have already read the entire file
186 fileLength=bytesLength;
187 } else {
188 // get the file length
189 fileLength=T_FileStream_size(f);
190 }
191
192 /*
193 * get the charset:
194 * 1. Unicode signature
195 * 2. treat as ISO-8859-1 and read XML encoding="charser"
196 * 3. default to UTF-8
197 */
198 charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
199 if(U_SUCCESS(errorCode) && charset!=NULL) {
200 // open converter according to Unicode signature
201 cnv=ucnv_open(charset, &errorCode);
202 } else {
203 // read as Latin-1 and parse the XML declaration and encoding
204 cnv=ucnv_open("ISO-8859-1", &errorCode);
205 if(U_FAILURE(errorCode)) {
206 // unexpected error opening Latin-1 converter
207 goto exit;
208 }
209
210 buffer=src.getBuffer(bytesLength);
211 if(buffer==NULL) {
212 // unexpected failure to reserve some string capacity
213 errorCode=U_MEMORY_ALLOCATION_ERROR;
214 goto exit;
215 }
216 pb=bytes;
217 pu=buffer;
218 ucnv_toUnicode(
219 cnv,
220 &pu, buffer+src.getCapacity(),
221 &pb, bytes+bytesLength,
222 NULL, TRUE, &errorCode);
223 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
224 ucnv_close(cnv);
225 cnv=NULL;
226 if(U_FAILURE(errorCode)) {
227 // unexpected error in conversion from Latin-1
228 src.remove();
229 goto exit;
230 }
231
232 // parse XML declaration
233 if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
234 int32_t declEnd=mXMLDecl.end(errorCode);
235 // go beyond <?xml
236 int32_t pos=src.indexOf((UChar)x_l)+1;
237
238 mAttrValue.reset(src);
239 while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element.
240 UnicodeString attName = mAttrValue.group(1, errorCode);
241 UnicodeString attValue = mAttrValue.group(2, errorCode);
242
243 // Trim the quotes from the att value. These are left over from the original regex
244 // that parsed the attribue, which couldn't conveniently strip them.
245 attValue.remove(0,1); // one char from the beginning
246 attValue.truncate(attValue.length()-1); // and one from the end.
247
248 if(attName==UNICODE_STRING("encoding", 8)) {
249 length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
250 charset=charsetBuffer;
251 break;
252 }
253 pos = mAttrValue.end(2, errorCode);
254 }
255
256 if(charset==NULL) {
257 // default to UTF-8
258 charset="UTF-8";
259 }
260 cnv=ucnv_open(charset, &errorCode);
261 }
262 }
263
264 if(U_FAILURE(errorCode)) {
265 // unable to open the converter
266 goto exit;
267 }
268
269 // convert the file contents
270 capacity=fileLength; // estimated capacity
271 src.getBuffer(capacity);
272 src.releaseBuffer(0); // zero length
273 flush=FALSE;
274 for(;;) {
275 // convert contents of bytes[bytesLength]
276 pb=bytes;
277 for(;;) {
278 length=src.length();
279 buffer=src.getBuffer(capacity);
280 if(buffer==NULL) {
281 // unexpected failure to reserve some string capacity
282 errorCode=U_MEMORY_ALLOCATION_ERROR;
283 goto exit;
284 }
285
286 pu=buffer+length;
287 ucnv_toUnicode(
288 cnv, &pu, buffer+src.getCapacity(),
289 &pb, bytes+bytesLength,
290 NULL, FALSE, &errorCode);
291 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
292 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
293 errorCode=U_ZERO_ERROR;
294 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
295 } else {
296 break;
297 }
298 }
299
300 if(U_FAILURE(errorCode)) {
301 break; // conversion error
302 }
303
304 if(flush) {
305 break; // completely converted the file
306 }
307
308 // read next block
309 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
310 if(bytesLength==0) {
311 // reached end of file, convert once more to flush the converter
312 flush=TRUE;
313 }
314 };
315
316 exit:
317 ucnv_close(cnv);
318 T_FileStream_close(f);
319
320 if(U_SUCCESS(errorCode)) {
321 return parse(src, errorCode);
322 } else {
323 return NULL;
324 }
325 }
326
327 UXMLElement *
parse(const UnicodeString & src,UErrorCode & status)328 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
329 if(U_FAILURE(status)) {
330 return NULL;
331 }
332
333 UXMLElement *root = NULL;
334 fPos = 0; // TODO use just a local pos variable and pass it into functions
335 // where necessary?
336
337 // set all matchers to work on the input string
338 mXMLDecl.reset(src);
339 mXMLComment.reset(src);
340 mXMLSP.reset(src);
341 mXMLDoctype.reset(src);
342 mXMLPI.reset(src);
343 mXMLElemStart.reset(src);
344 mXMLElemEnd.reset(src);
345 mXMLElemEmpty.reset(src);
346 mXMLCharData.reset(src);
347 mAttrValue.reset(src);
348 mAttrNormalizer.reset(src);
349 mNewLineNormalizer.reset(src);
350 mAmps.reset(src);
351
352 // Consume the XML Declaration, if present.
353 if (mXMLDecl.lookingAt(fPos, status)) {
354 fPos = mXMLDecl.end(status);
355 }
356
357 // Consume "misc" [XML production 27] appearing before DocType
358 parseMisc(status);
359
360 // Consume a DocType declaration, if present.
361 if (mXMLDoctype.lookingAt(fPos, status)) {
362 fPos = mXMLDoctype.end(status);
363 }
364
365 // Consume additional "misc" [XML production 27] appearing after the DocType
366 parseMisc(status);
367
368 // Get the root element
369 if (mXMLElemEmpty.lookingAt(fPos, status)) {
370 // Root is an empty element (no nested elements or content)
371 root = createElement(mXMLElemEmpty, status);
372 fPos = mXMLElemEmpty.end(status);
373 } else {
374 if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
375 error("Root Element expected", status);
376 goto errorExit;
377 }
378 root = createElement(mXMLElemStart, status);
379 UXMLElement *el = root;
380
381 //
382 // This is the loop that consumes the root element of the document,
383 // including all nested content. Nested elements are handled by
384 // explicit pushes/pops of the element stack; there is no recursion
385 // in the control flow of this code.
386 // "el" always refers to the current element, the one to which content
387 // is being added. It is above the top of the element stack.
388 for (;;) {
389 // Nested Element Start
390 if (mXMLElemStart.lookingAt(fPos, status)) {
391 UXMLElement *t = createElement(mXMLElemStart, status);
392 el->fChildren.addElement(t, status);
393 t->fParent = el;
394 fElementStack.push(el, status);
395 el = t;
396 continue;
397 }
398
399 // Text Content. String is concatenated onto the current node's content,
400 // but only if it contains something other than spaces.
401 UnicodeString s = scanContent(status);
402 if (s.length() > 0) {
403 mXMLSP.reset(s);
404 if (mXMLSP.matches(status) == FALSE) {
405 // This chunk of text contains something other than just
406 // white space. Make a child node for it.
407 replaceCharRefs(s, status);
408 el->fChildren.addElement(s.clone(), status);
409 }
410 mXMLSP.reset(src); // The matchers need to stay set to the main input string.
411 continue;
412 }
413
414 // Comments. Discard.
415 if (mXMLComment.lookingAt(fPos, status)) {
416 fPos = mXMLComment.end(status);
417 continue;
418 }
419
420 // PIs. Discard.
421 if (mXMLPI.lookingAt(fPos, status)) {
422 fPos = mXMLPI.end(status);
423 continue;
424 }
425
426 // Element End
427 if (mXMLElemEnd.lookingAt(fPos, status)) {
428 fPos = mXMLElemEnd.end(0, status);
429 const UnicodeString name = mXMLElemEnd.group(1, status);
430 if (name != *el->fName) {
431 error("Element start / end tag mismatch", status);
432 goto errorExit;
433 }
434 if (fElementStack.empty()) {
435 // Close of the root element. We're done with the doc.
436 el = NULL;
437 break;
438 }
439 el = (UXMLElement *)fElementStack.pop();
440 continue;
441 }
442
443 // Empty Element. Stored as a child of the current element, but not stacked.
444 if (mXMLElemEmpty.lookingAt(fPos, status)) {
445 UXMLElement *t = createElement(mXMLElemEmpty, status);
446 el->fChildren.addElement(t, status);
447 continue;
448 }
449
450 // Hit something within the document that doesn't match anything.
451 // It's an error.
452 error("Unrecognized markup", status);
453 break;
454 }
455
456 if (el != NULL || !fElementStack.empty()) {
457 // We bailed out early, for some reason.
458 error("Root element not closed.", status);
459 goto errorExit;
460 }
461 }
462
463 // Root Element parse is complete.
464 // Consume the annoying xml "Misc" that can appear at the end of the doc.
465 parseMisc(status);
466
467 // We should have reached the end of the input
468 if (fPos != src.length()) {
469 error("Extra content at the end of the document", status);
470 goto errorExit;
471 }
472
473 // Success!
474 return root;
475
476 errorExit:
477 delete root;
478 return NULL;
479 }
480
481 //
482 // createElement
483 // We've just matched an element start tag. Create and fill in a UXMLElement object
484 // for it.
485 //
486 UXMLElement *
createElement(RegexMatcher & mEl,UErrorCode & status)487 UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) {
488 // First capture group is the element's name.
489 UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
490
491 // Scan for attributes.
492 int32_t pos = mEl.end(1, status); // The position after the end of the tag name
493
494 while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element.
495 UnicodeString attName = mAttrValue.group(1, status);
496 UnicodeString attValue = mAttrValue.group(2, status);
497
498 // Trim the quotes from the att value. These are left over from the original regex
499 // that parsed the attribue, which couldn't conveniently strip them.
500 attValue.remove(0,1); // one char from the beginning
501 attValue.truncate(attValue.length()-1); // and one from the end.
502
503 // XML Attribue value normalization.
504 // This is one of the really screwy parts of the XML spec.
505 // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
506 // Note that non-validating parsers must treat all entities as type CDATA
507 // which simplifies things some.
508
509 // Att normalization step 1: normalize any newlines in the attribute value
510 mNewLineNormalizer.reset(attValue);
511 attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
512
513 // Next change all xml white space chars to plain \u0020 spaces.
514 mAttrNormalizer.reset(attValue);
515 UnicodeString oneSpace((UChar)0x0020);
516 attValue = mAttrNormalizer.replaceAll(oneSpace, status);
517
518 // Replace character entities.
519 replaceCharRefs(attValue, status);
520
521 // Save the attribute name and value in our document structure.
522 el->fAttNames.addElement((void *)intern(attName, status), status);
523 el->fAttValues.addElement(attValue.clone(), status);
524 pos = mAttrValue.end(2, status);
525 }
526 fPos = mEl.end(0, status);
527 return el;
528 }
529
530 //
531 // parseMisc
532 // Consume XML "Misc" [production #27]
533 // which is any combination of space, PI and comments
534 // Need to watch end-of-input because xml MISC stuff is allowed after
535 // the document element, so we WILL scan off the end in this function
536 //
537 void
parseMisc(UErrorCode & status)538 UXMLParser::parseMisc(UErrorCode &status) {
539 for (;;) {
540 if (fPos >= mXMLPI.input().length()) {
541 break;
542 }
543 if (mXMLPI.lookingAt(fPos, status)) {
544 fPos = mXMLPI.end(status);
545 continue;
546 }
547 if (mXMLSP.lookingAt(fPos, status)) {
548 fPos = mXMLSP.end(status);
549 continue;
550 }
551 if (mXMLComment.lookingAt(fPos, status)) {
552 fPos = mXMLComment.end(status);
553 continue;
554 }
555 break;
556 }
557 }
558
559 //
560 // Scan for document content.
561 //
562 UnicodeString
scanContent(UErrorCode & status)563 UXMLParser::scanContent(UErrorCode &status) {
564 UnicodeString result;
565 if (mXMLCharData.lookingAt(fPos, status)) {
566 result = mXMLCharData.group((int32_t)0, status);
567 // Normalize the new-lines. (Before char ref substitution)
568 mNewLineNormalizer.reset(result);
569 result = mNewLineNormalizer.replaceAll(fOneLF, status);
570
571 // TODO: handle CDATA
572 fPos = mXMLCharData.end(0, status);
573 }
574
575 return result;
576 }
577
578 //
579 // replaceCharRefs
580 //
581 // replace the char entities < & { ካ etc. in a string
582 // with the corresponding actual character.
583 //
584 void
replaceCharRefs(UnicodeString & s,UErrorCode & status)585 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
586 UnicodeString result;
587 UnicodeString replacement;
588 int i;
589
590 mAmps.reset(s);
591 // See the initialization for the regex matcher mAmps.
592 // Which entity we've matched is determined by which capture group has content,
593 // which is flaged by start() of that group not being -1.
594 while (mAmps.find()) {
595 if (mAmps.start(1, status) != -1) {
596 replacement.setTo((UChar)x_AMP);
597 } else if (mAmps.start(2, status) != -1) {
598 replacement.setTo((UChar)x_LT);
599 } else if (mAmps.start(3, status) != -1) {
600 replacement.setTo((UChar)x_GT);
601 } else if (mAmps.start(4, status) != -1) {
602 replacement.setTo((UChar)x_APOS);
603 } else if (mAmps.start(5, status) != -1) {
604 replacement.setTo((UChar)x_QUOT);
605 } else if (mAmps.start(6, status) != -1) {
606 UnicodeString hexString = mAmps.group(6, status);
607 UChar32 val = 0;
608 for (i=0; i<hexString.length(); i++) {
609 val = (val << 4) + u_digit(hexString.charAt(i), 16);
610 }
611 // TODO: some verification that the character is valid
612 replacement.setTo(val);
613 } else if (mAmps.start(7, status) != -1) {
614 UnicodeString decimalString = mAmps.group(7, status);
615 UChar32 val = 0;
616 for (i=0; i<decimalString.length(); i++) {
617 val = val*10 + u_digit(decimalString.charAt(i), 10);
618 }
619 // TODO: some verification that the character is valid
620 replacement.setTo(val);
621 } else {
622 // An unrecognized &entity; Leave it alone.
623 // TODO: check that it really looks like an entity, and is not some
624 // random & in the text.
625 replacement = mAmps.group((int32_t)0, status);
626 }
627 mAmps.appendReplacement(result, replacement, status);
628 }
629 mAmps.appendTail(result);
630 s = result;
631 }
632
633 void
error(const char * message,UErrorCode & status)634 UXMLParser::error(const char *message, UErrorCode &status) {
635 // TODO: something better here...
636 const UnicodeString &src=mXMLDecl.input();
637 int line = 0;
638 int ci = 0;
639 while (ci < fPos && ci>=0) {
640 ci = src.indexOf((UChar)0x0a, ci+1);
641 line++;
642 }
643 fprintf(stderr, "Error: %s at line %d\n", message, line);
644 if (U_SUCCESS(status)) {
645 status = U_PARSE_ERROR;
646 }
647 }
648
649 // intern strings like in Java
650
651 const UnicodeString *
intern(const UnicodeString & s,UErrorCode & errorCode)652 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
653 const UHashElement *he=fNames.find(s);
654 if(he!=NULL) {
655 // already a known name, return its hashed key pointer
656 return (const UnicodeString *)he->key.pointer;
657 } else {
658 // add this new name and return its hashed key pointer
659 fNames.puti(s, 0, errorCode);
660 he=fNames.find(s);
661 return (const UnicodeString *)he->key.pointer;
662 }
663 }
664
665 const UnicodeString *
findName(const UnicodeString & s) const666 UXMLParser::findName(const UnicodeString &s) const {
667 const UHashElement *he=fNames.find(s);
668 if(he!=NULL) {
669 // a known name, return its hashed key pointer
670 return (const UnicodeString *)he->key.pointer;
671 } else {
672 // unknown name
673 return NULL;
674 }
675 }
676
677 // UXMLElement ------------------------------------------------------------- ***
678
UXMLElement(const UXMLParser * parser,const UnicodeString * name,UErrorCode & errorCode)679 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
680 fParser(parser),
681 fName(name),
682 fAttNames(errorCode),
683 fAttValues(errorCode),
684 fChildren(errorCode),
685 fParent(NULL)
686 {
687 }
688
~UXMLElement()689 UXMLElement::~UXMLElement() {
690 int i;
691 // attribute names are owned by the UXMLParser, don't delete them here
692 for (i=fAttValues.size()-1; i>=0; i--) {
693 delete (UObject *)fAttValues.elementAt(i);
694 }
695 for (i=fChildren.size()-1; i>=0; i--) {
696 delete (UObject *)fChildren.elementAt(i);
697 }
698 }
699
700 const UnicodeString &
getTagName() const701 UXMLElement::getTagName() const {
702 return *fName;
703 }
704
705 UnicodeString
getText(UBool recurse) const706 UXMLElement::getText(UBool recurse) const {
707 UnicodeString text;
708 appendText(text, recurse);
709 return text;
710 }
711
712 void
appendText(UnicodeString & text,UBool recurse) const713 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
714 const UObject *node;
715 int32_t i, count=fChildren.size();
716 for(i=0; i<count; ++i) {
717 node=(const UObject *)fChildren.elementAt(i);
718 const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
719 if(s!=NULL) {
720 text.append(*s);
721 } else if(recurse) /* must be a UXMLElement */ {
722 ((const UXMLElement *)node)->appendText(text, recurse);
723 }
724 }
725 }
726
727 int32_t
countAttributes() const728 UXMLElement::countAttributes() const {
729 return fAttNames.size();
730 }
731
732 const UnicodeString *
getAttribute(int32_t i,UnicodeString & name,UnicodeString & value) const733 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
734 if(0<=i && i<fAttNames.size()) {
735 name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
736 value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
737 return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
738 } else {
739 return NULL;
740 }
741 }
742
743 const UnicodeString *
getAttribute(const UnicodeString & name) const744 UXMLElement::getAttribute(const UnicodeString &name) const {
745 // search for the attribute name by comparing the interned pointer,
746 // not the string contents
747 const UnicodeString *p=fParser->findName(name);
748 if(p==NULL) {
749 return NULL; // no such attribute seen by the parser at all
750 }
751
752 int32_t i, count=fAttNames.size();
753 for(i=0; i<count; ++i) {
754 if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
755 return (const UnicodeString *)fAttValues.elementAt(i);
756 }
757 }
758 return NULL;
759 }
760
761 int32_t
countChildren() const762 UXMLElement::countChildren() const {
763 return fChildren.size();
764 }
765
766 const UObject *
getChild(int32_t i,UXMLNodeType & type) const767 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
768 if(0<=i && i<fChildren.size()) {
769 const UObject *node=(const UObject *)fChildren.elementAt(i);
770 if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
771 type=UXML_NODE_TYPE_ELEMENT;
772 } else {
773 type=UXML_NODE_TYPE_STRING;
774 }
775 return node;
776 } else {
777 return NULL;
778 }
779 }
780
781 const UXMLElement *
nextChildElement(int32_t & i) const782 UXMLElement::nextChildElement(int32_t &i) const {
783 if(i<0) {
784 return NULL;
785 }
786
787 const UObject *node;
788 int32_t count=fChildren.size();
789 while(i<count) {
790 node=(const UObject *)fChildren.elementAt(i++);
791 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
792 if(elem!=NULL) {
793 return elem;
794 }
795 }
796 return NULL;
797 }
798
799 const UXMLElement *
getChildElement(const UnicodeString & name) const800 UXMLElement::getChildElement(const UnicodeString &name) const {
801 // search for the element name by comparing the interned pointer,
802 // not the string contents
803 const UnicodeString *p=fParser->findName(name);
804 if(p==NULL) {
805 return NULL; // no such element seen by the parser at all
806 }
807
808 const UObject *node;
809 int32_t i, count=fChildren.size();
810 for(i=0; i<count; ++i) {
811 node=(const UObject *)fChildren.elementAt(i);
812 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
813 if(elem!=NULL) {
814 if(p==elem->fName) {
815 return elem;
816 }
817 }
818 }
819 return NULL;
820 }
821
822 U_NAMESPACE_END
823
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
825
826