1 /*
2 Copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
3
4 This software is provided 'as-is', without any express or implied
5 warranty. In no event will the authors be held liable for any
6 damages arising from the use of this software.
7
8 Permission is granted to anyone to use this software for any
9 purpose, including commercial applications, and to alter it and
10 redistribute it freely, subject to the following restrictions:
11
12 1. The origin of this software must not be misrepresented; you must
13 not claim that you wrote the original software. If you use this
14 software in a product, an acknowledgment in the product documentation
15 would be appreciated but is not required.
16
17 2. Altered source versions must be plainly marked as such, and
18 must not be misrepresented as being the original software.
19
20 3. This notice may not be removed or altered from any source
21 distribution.
22 */
23
24 #include "tinyxml.h"
25 #include <ctype.h>
26 #include <strstream>
27 using namespace std;
28
29 //#define DEBUG_PARSER
30
31 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
32 {
33 { "&", 5, '&' },
34 { "<", 4, '<' },
35 { ">", 4, '>' },
36 { """, 6, '\"' },
37 { "'", 6, '\'' }
38 };
39
40
SkipWhiteSpace(const char * p)41 const char* TiXmlBase::SkipWhiteSpace( const char* p )
42 {
43 if ( !p || !*p )
44 {
45 return 0;
46 }
47 while ( p && *p )
48 {
49 if ( isspace( *p ) || *p == '\n' || *p =='\r' ) // Still using old rules for white space.
50 ++p;
51 else
52 break;
53 }
54
55 return p;
56 }
57
58
StreamWhiteSpace(std::istream * in,std::string * tag)59 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream* in, std::string* tag )
60 {
61 for( ;; )
62 {
63 if ( !in->good() ) return false;
64
65 int c = in->peek();
66 if ( !IsWhiteSpace( c ) )
67 return true;
68 *tag += in->get();
69 }
70 }
71
72
StreamTo(std::istream * in,int character,std::string * tag)73 /*static*/ bool TiXmlBase::StreamTo( std::istream* in, int character, std::string* tag )
74 {
75 while ( in->good() )
76 {
77 int c = in->peek();
78 if ( c == character )
79 return true;
80
81 in->get();
82 *tag += c;
83 }
84 return false;
85 }
86
87
ReadName(const char * p,string * name)88 const char* TiXmlBase::ReadName( const char* p, string* name )
89 {
90 *name = "";
91 assert( p );
92
93 // Names start with letters or underscores.
94 // After that, they can be letters, underscores, numbers,
95 // hyphens, or colons. (Colons are valid ony for namespaces,
96 // but tinyxml can't tell namespaces from names.)
97 if ( p && *p
98 && ( isalpha( (unsigned char) *p ) || *p == '_' ) )
99 {
100 while( p && *p
101 && ( isalnum( (unsigned char ) *p )
102 || *p == '_'
103 || *p == '-'
104 || *p == ':' ) )
105 {
106 (*name) += *p;
107 ++p;
108 }
109 return p;
110 }
111 return 0;
112 }
113
114
GetEntity(const char * p,char * value)115 const char* TiXmlBase::GetEntity( const char* p, char* value )
116 {
117 // Presume an entity, and pull it out.
118 string ent;
119 int i;
120
121 // Ignore the &#x entities.
122 if ( strncmp( "&#x", p, 3 ) == 0 )
123 {
124 *value = *p;
125 return p+1;
126 }
127
128 // Now try to match it.
129 for( i=0; i<NUM_ENTITY; ++i )
130 {
131 if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
132 {
133 assert( strlen( entity[i].str ) == entity[i].strLength );
134 *value = entity[i].chr;
135 return ( p + entity[i].strLength );
136 }
137 }
138
139 // So it wasn't an entity, its unrecognized, or something like that.
140 *value = *p; // Don't put back the last one, since we return it!
141 return p+1;
142 }
143
144
StringEqual(const char * p,const char * tag,bool ignoreCase)145 bool TiXmlBase::StringEqual( const char* p,
146 const char* tag,
147 bool ignoreCase )
148 {
149 assert( p );
150 if ( !p || !*p )
151 {
152 assert( 0 );
153 return false;
154 }
155
156 if ( tolower( *p ) == tolower( *tag ) )
157 {
158 const char* q = p;
159
160 if (ignoreCase)
161 {
162 while ( *q && *tag && *q == *tag )
163 {
164 ++q;
165 ++tag;
166 }
167
168 if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
169 {
170 return true;
171 }
172 }
173 else
174 {
175 while ( *q && *tag && tolower( *q ) == tolower( *tag ) )
176 {
177 ++q;
178 ++tag;
179 }
180
181 if ( *tag == 0 )
182 {
183 return true;
184 }
185 }
186 }
187 return false;
188 }
189
190
ReadText(const char * p,string * text,bool trimWhiteSpace,const char * endTag,bool caseInsensitive)191 const char* TiXmlBase::ReadText( const char* p,
192 string* text,
193 bool trimWhiteSpace,
194 const char* endTag,
195 bool caseInsensitive )
196 {
197 *text = "";
198
199 if ( !trimWhiteSpace // certain tags always keep whitespace
200 || !condenseWhiteSpace ) // if true, whitespace is always kept
201 {
202 // Keep all the white space.
203 while ( p && *p
204 && !StringEqual( p, endTag, caseInsensitive )
205 )
206 {
207 char c;
208 p = GetChar( p, &c );
209 text->append( &c, 1 );
210 }
211 }
212 else
213 {
214 bool whitespace = false;
215
216 // Remove leading white space:
217 p = SkipWhiteSpace( p );
218 while ( p && *p
219 && !StringEqual( p, endTag, caseInsensitive ) )
220 {
221 if ( *p == '\r' || *p == '\n' )
222 {
223 whitespace = true;
224 ++p;
225 }
226 else if ( isspace( *p ) )
227 {
228 whitespace = true;
229 ++p;
230 }
231 else
232 {
233 // If we've found whitespace, add it before the
234 // new character. Any whitespace just becomes a space.
235 if ( whitespace )
236 {
237 text->append( " ", 1 );
238 whitespace = false;
239 }
240 char c;
241 p = GetChar( p, &c );
242 text->append( &c, 1 );
243 }
244 }
245 }
246 return p + strlen( endTag );
247 }
248
249
StreamIn(std::istream * in,std::string * tag)250 void TiXmlDocument::StreamIn( std::istream* in, std::string* tag )
251 {
252 // The basic issue with a document is that we don't know what we're
253 // streaming. Read something presumed to be a tag (and hope), then
254 // identify it, and call the appropriate stream method on the tag.
255 //
256 // This "pre-streaming" will never read the closing ">" so the
257 // sub-tag can orient itself.
258
259 if ( !StreamTo( in, '<', tag ) )
260 {
261 SetError( TIXML_ERROR_PARSING_EMPTY );
262 return;
263 }
264
265 while ( in->good() )
266 {
267 int tagIndex = tag->length();
268 while ( in->good() && in->peek() != '>' )
269 {
270 int c = in->get();
271 (*tag) += (char) c;
272 }
273
274 if ( in->good() )
275 {
276 // We now have something we presume to be a node of
277 // some sort. Identify it, and call the node to
278 // continue streaming.
279 TiXmlNode* node = Identify( tag->c_str() + tagIndex );
280
281 if ( node )
282 {
283 node->StreamIn( in, tag );
284 bool isElement = node->ToElement() != 0;
285 delete node;
286 node = 0;
287
288 // If this is the root element, we're done. Parsing will be
289 // done by the >> operator.
290 if ( isElement )
291 {
292 return;
293 }
294 }
295 else
296 {
297 SetError( TIXML_ERROR );
298 return;
299 }
300 }
301 }
302 // We should have returned sooner.
303 SetError( TIXML_ERROR );
304 }
305
306
Parse(const char * p)307 const char* TiXmlDocument::Parse( const char* p )
308 {
309 // Parse away, at the document level. Since a document
310 // contains nothing but other tags, most of what happens
311 // here is skipping white space.
312 //
313 // In this variant (as opposed to stream and Parse) we
314 // read everything we can.
315
316
317 if ( !p || !*p || !( p = SkipWhiteSpace( p ) ) )
318 {
319 SetError( TIXML_ERROR_DOCUMENT_EMPTY );
320 return false;
321 }
322
323 while ( p && *p )
324 {
325 TiXmlNode* node = Identify( p );
326 if ( node )
327 {
328 p = node->Parse( p );
329 LinkEndChild( node );
330 }
331 else
332 {
333 break;
334 }
335 p = SkipWhiteSpace( p );
336 }
337 // All is well.
338 return p;
339 }
340
341
Identify(const char * p)342 TiXmlNode* TiXmlNode::Identify( const char* p )
343 {
344 TiXmlNode* returnNode = 0;
345
346 p = SkipWhiteSpace( p );
347 if( !p || !*p || *p != '<' )
348 {
349 return 0;
350 }
351
352 TiXmlDocument* doc = GetDocument();
353 p = SkipWhiteSpace( p );
354
355 if ( !p || !*p )
356 {
357 return 0;
358 }
359
360 // What is this thing?
361 // - Elements start with a letter or underscore, but xml is reserved.
362 // - Comments: <!--
363 // - Decleration: <?xml
364 // - Everthing else is unknown to tinyxml.
365 //
366
367 const char* xmlHeader = { "<?xml" };
368 const char* commentHeader = { "<!--" };
369
370 if ( StringEqual( p, xmlHeader, true ) )
371 {
372 #ifdef DEBUG_PARSER
373 TIXML_LOG( "XML parsing Declaration\n" );
374 #endif
375 returnNode = new TiXmlDeclaration();
376 }
377 else if ( isalpha( *(p+1) )
378 || *(p+1) == '_' )
379 {
380 #ifdef DEBUG_PARSER
381 TIXML_LOG( "XML parsing Element\n" );
382 #endif
383 returnNode = new TiXmlElement( "" );
384 }
385 else if ( StringEqual( p, commentHeader, false ) )
386 {
387 #ifdef DEBUG_PARSER
388 TIXML_LOG( "XML parsing Comment\n" );
389 #endif
390 returnNode = new TiXmlComment();
391 }
392 else
393 {
394 #ifdef DEBUG_PARSER
395 TIXML_LOG( "XML parsing Unknown\n" );
396 #endif
397 returnNode = new TiXmlUnknown();
398 }
399
400 if ( returnNode )
401 {
402 // Set the parent, so it can report errors
403 returnNode->parent = this;
404 //p = returnNode->Parse( p );
405 }
406 else
407 {
408 if ( doc )
409 doc->SetError( TIXML_ERROR_OUT_OF_MEMORY );
410 }
411 return returnNode;
412 }
413
414
StreamIn(std::istream * in,std::string * tag)415 void TiXmlElement::StreamIn( std::istream* in, std::string* tag )
416 {
417 // We're called with some amount of pre-parsing. That is, some of "this"
418 // element is in "tag". Go ahead and stream to the closing ">"
419 while( in->good() )
420 {
421 int c = in->get();
422 (*tag) += (char) c ;
423
424 if ( c == '>' )
425 break;
426 }
427
428 if ( tag->length() < 3 ) return;
429
430 // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
431 // If not, identify and stream.
432
433 if ( tag->at( tag->length() - 1 ) == '>'
434 && tag->at( tag->length() - 2 ) == '/' )
435 {
436 // All good!
437 return;
438 }
439 else if ( tag->at( tag->length() - 1 ) == '>' )
440 {
441 // There is more. Could be:
442 // text
443 // closing tag
444 // another node.
445 for ( ;; )
446 {
447 StreamWhiteSpace( in, tag );
448
449 // Do we have text?
450 if ( in->peek() != '<' )
451 {
452 // Yep, text.
453 TiXmlText text( "" );
454 text.StreamIn( in, tag );
455
456 // What follows text is a closing tag or another node.
457 // Go around again and figure it out.
458 continue;
459 }
460
461 // We now have either a closing tag...or another node.
462 // We should be at a "<", regardless.
463 if ( !in->good() ) return;
464 assert( in->peek() == '<' );
465 int tagIndex = tag->length();
466
467 bool closingTag = false;
468 bool firstCharFound = false;
469
470 for( ;; )
471 {
472 if ( !in->good() )
473 return;
474
475 int c = in->peek();
476
477 if ( c == '>' )
478 break;
479
480 *tag += c;
481 in->get();
482
483 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
484 {
485 firstCharFound = true;
486 if ( c == '/' )
487 closingTag = true;
488 }
489 }
490 // If it was a closing tag, then read in the closing '>' to clean up the input stream.
491 // If it was not, the streaming will be done by the tag.
492 if ( closingTag )
493 {
494 int c = in->get();
495 assert( c == '>' );
496 *tag += c;
497
498 // We are done, once we've found our closing tag.
499 return;
500 }
501 else
502 {
503 // If not a closing tag, id it, and stream.
504 const char* tagloc = tag->c_str() + tagIndex;
505 TiXmlNode* node = Identify( tagloc );
506 if ( !node )
507 return;
508 node->StreamIn( in, tag );
509 delete node;
510 node = 0;
511
512 // No return: go around from the beginning: text, closing tag, or node.
513 }
514 }
515 }
516 }
517
518
Parse(const char * p)519 const char* TiXmlElement::Parse( const char* p )
520 {
521 p = SkipWhiteSpace( p );
522 TiXmlDocument* document = GetDocument();
523
524 if ( !p || !*p || *p != '<' )
525 {
526 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT );
527 return false;
528 }
529
530 p = SkipWhiteSpace( p+1 );
531
532 // Read the name.
533 p = ReadName( p, &value );
534 if ( !p || !*p )
535 {
536 if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME );
537 return false;
538 }
539
540 string endTag = "</";
541 endTag += value;
542 endTag += ">";
543
544 // Check for and read attributes. Also look for an empty
545 // tag or an end tag.
546 while ( p && *p )
547 {
548 p = SkipWhiteSpace( p );
549 if ( !p || !*p )
550 {
551 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
552 return 0;
553 }
554 if ( *p == '/' )
555 {
556 ++p;
557 // Empty tag.
558 if ( *p != '>' )
559 {
560 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY );
561 return 0;
562 }
563 return (p+1);
564 }
565 else if ( *p == '>' )
566 {
567 // Done with attributes (if there were any.)
568 // Read the value -- which can include other
569 // elements -- read the end tag, and return.
570 ++p;
571 p = ReadValue( p ); // Note this is an Element method, and will set the error if one happens.
572 if ( !p || !*p )
573 return 0;
574
575 // We should find the end tag now
576 if ( StringEqual( p, endTag.c_str(), false ) )
577 {
578 p += endTag.length();
579 return p;
580 }
581 else
582 {
583 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG );
584 return 0;
585 }
586 }
587 else
588 {
589 // Try to read an element:
590 TiXmlAttribute attrib;
591 attrib.SetDocument( document );
592 p = attrib.Parse( p );
593
594 if ( !p || !*p )
595 {
596 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT );
597 return 0;
598 }
599 SetAttribute( attrib.Name(), attrib.Value() );
600 }
601 }
602 return p;
603 }
604
605
ReadValue(const char * p)606 const char* TiXmlElement::ReadValue( const char* p )
607 {
608 TiXmlDocument* document = GetDocument();
609
610 // Read in text and elements in any order.
611 p = SkipWhiteSpace( p );
612 while ( p && *p )
613 {
614 // string text;
615 // while ( p && *p && *p != '<' )
616 // {
617 // text += (*p);
618 // ++p;
619 // }
620 //
621 // p = SkipWhiteSpace( p );
622
623 if ( *p != '<' )
624 {
625 // Take what we have, make a text element.
626 TiXmlText* textNode = new TiXmlText( "" );
627
628 if ( !textNode )
629 {
630 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY );
631 return 0;
632 }
633
634 p = textNode->Parse( p );
635
636 if ( !textNode->Blank() )
637 LinkEndChild( textNode );
638 else
639 delete textNode;
640 }
641 else
642 {
643 // We hit a '<'
644 // Have we hit a new element or an end tag?
645 if ( StringEqual( p, "</", false ) )
646 {
647 return p;
648 }
649 else
650 {
651 TiXmlNode* node = Identify( p );
652 if ( node )
653 {
654 p = node->Parse( p );
655 LinkEndChild( node );
656 }
657 else
658 {
659 return 0;
660 }
661 }
662 }
663 p = SkipWhiteSpace( p );
664 }
665
666 if ( !p )
667 {
668 if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE );
669 }
670 return p;
671 }
672
673
StreamIn(std::istream * in,std::string * tag)674 void TiXmlUnknown::StreamIn( std::istream* in, std::string* tag )
675 {
676 while ( in->good() )
677 {
678 int c = in->get();
679 (*tag) += c;
680
681 if ( c == '>' )
682 {
683 // All is well.
684 return;
685 }
686 }
687 }
688
689
Parse(const char * p)690 const char* TiXmlUnknown::Parse( const char* p )
691 {
692 TiXmlDocument* document = GetDocument();
693 p = SkipWhiteSpace( p );
694 if ( !p || !*p || *p != '<' )
695 {
696 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN );
697 return 0;
698 }
699 ++p;
700 value = "";
701
702 while ( *p && *p != '>' )
703 {
704 value += *p;
705 ++p;
706 }
707
708 if ( *p == '>' )
709 return p+1;
710 return p;
711 }
712
713
StreamIn(std::istream * in,std::string * tag)714 void TiXmlComment::StreamIn( std::istream* in, std::string* tag )
715 {
716 while ( in->good() )
717 {
718 int c = in->get();
719 (*tag) += c;
720
721 if ( c == '>'
722 && tag->at( tag->length() - 2 ) == '-'
723 && tag->at( tag->length() - 3 ) == '-' )
724 {
725 // All is well.
726 return;
727 }
728 }
729 }
730
731
Parse(const char * p)732 const char* TiXmlComment::Parse( const char* p )
733 {
734 TiXmlDocument* document = GetDocument();
735 value = "";
736
737 p = SkipWhiteSpace( p );
738 const char* startTag = "<!--";
739 const char* endTag = "-->";
740
741 if ( !StringEqual( p, startTag, false ) )
742 {
743 document->SetError( TIXML_ERROR_PARSING_COMMENT );
744 return 0;
745 }
746 p += strlen( startTag );
747 p = ReadText( p, &value, false, endTag, false );
748 return p;
749 }
750
751
Parse(const char * p)752 const char* TiXmlAttribute::Parse( const char* p )
753 {
754 p = SkipWhiteSpace( p );
755 if ( !p || !*p ) return 0;
756
757 // Read the name, the '=' and the value.
758 p = ReadName( p, &name );
759 if ( !p || !*p )
760 {
761 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
762 return 0;
763 }
764 p = SkipWhiteSpace( p );
765 if ( !p || !*p || *p != '=' )
766 {
767 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
768 return 0;
769 }
770
771 ++p; // skip '='
772 p = SkipWhiteSpace( p );
773 if ( !p || !*p )
774 {
775 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
776 return 0;
777 }
778
779 const char* end;
780
781 if ( *p == '\'' )
782 {
783 ++p;
784 end = "\'";
785 p = ReadText( p, &value, false, end, false );
786 }
787 else if ( *p == '"' )
788 {
789 ++p;
790 end = "\"";
791 p = ReadText( p, &value, false, end, false );
792 }
793 else
794 {
795 // All attribute values should be in single or double quotes.
796 // But this is such a common error that the parser will try
797 // its best, even without them.
798 value = "";
799 while ( p && *p // existence
800 && !isspace( *p ) && *p != '\n' && *p != '\r' // whitespace
801 && *p != '/' && *p != '>' ) // tag end
802 {
803 value += *p;
804 ++p;
805 }
806 }
807 return p;
808 }
809
810
StreamIn(std::istream * in,std::string * tag)811 void TiXmlText::StreamIn( std::istream* in, std::string* tag )
812 {
813 while ( in->good() )
814 {
815 int c = in->peek();
816 if ( c == '<' )
817 return;
818
819 (*tag) += c;
820 in->get();
821 }
822 }
823
824
825
Parse(const char * p)826 const char* TiXmlText::Parse( const char* p )
827 {
828 value = "";
829
830 //TiXmlDocument* doc = GetDocument();
831 bool ignoreWhite = true;
832 // if ( doc && !doc->IgnoreWhiteSpace() ) ignoreWhite = false;
833
834 const char* end = "<";
835 p = ReadText( p, &value, ignoreWhite, end, false );
836 if ( p )
837 return p-1; // don't truncate the '<'
838 return 0;
839 }
840
841
StreamIn(std::istream * in,std::string * tag)842 void TiXmlDeclaration::StreamIn( std::istream* in, std::string* tag )
843 {
844 while ( in->good() )
845 {
846 int c = in->get();
847 (*tag) += c;
848
849 if ( c == '>' )
850 {
851 // All is well.
852 return;
853 }
854 }
855 }
856
Parse(const char * p)857 const char* TiXmlDeclaration::Parse( const char* p )
858 {
859 p = SkipWhiteSpace( p );
860 // Find the beginning, find the end, and look for
861 // the stuff in-between.
862 TiXmlDocument* document = GetDocument();
863 if ( !p || !*p || !StringEqual( p, "<?xml", true ) )
864 {
865 if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION );
866 return 0;
867 }
868
869 p += 5;
870 // const char* start = p+5;
871 // const char* end = strstr( start, "?>" );
872
873 version = "";
874 encoding = "";
875 standalone = "";
876
877 while ( p && *p )
878 {
879 if ( *p == '>' )
880 {
881 ++p;
882 return p;
883 }
884
885 p = SkipWhiteSpace( p );
886 if ( StringEqual( p, "version", true ) )
887 {
888 // p += 7;
889 TiXmlAttribute attrib;
890 p = attrib.Parse( p );
891 version = attrib.Value();
892 }
893 else if ( StringEqual( p, "encoding", true ) )
894 {
895 // p += 8;
896 TiXmlAttribute attrib;
897 p = attrib.Parse( p );
898 encoding = attrib.Value();
899 }
900 else if ( StringEqual( p, "standalone", true ) )
901 {
902 // p += 10;
903 TiXmlAttribute attrib;
904 p = attrib.Parse( p );
905 standalone = attrib.Value();
906 }
907 else
908 {
909 // Read over whatever it is.
910 while( p && *p && *p != '>' && !isspace( *p ) )
911 ++p;
912 }
913 }
914 return 0;
915 }
916
Blank() const917 bool TiXmlText::Blank() const
918 {
919 for ( unsigned i=0; i<value.size(); i++ )
920 if ( !isspace( value[i] ) )
921 return false;
922 return true;
923 }
924
925