• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2000 Peter Kelly (pmk@post.com)
3  * Copyright (C) 2005, 2006, 2008 Apple Inc. All rights reserved.
4  * Copyright (C) 2006 Alexey Proskuryakov (ap@webkit.org)
5  * Copyright (C) 2007 Samuel Weinig (sam@webkit.org)
6  * Copyright (C) 2008 Nokia Corporation and/or its subsidiary(-ies)
7  * Copyright (C) 2008 Holger Hans Peter Freyther
8  * Copyright (C) 2008, 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
9  *
10  * This library is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Library General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This library is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Library General Public License for more details.
19  *
20  * You should have received a copy of the GNU Library General Public License
21  * along with this library; see the file COPYING.LIB.  If not, write to
22  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23  * Boston, MA 02110-1301, USA.
24  */
25 
26 #include "config.h"
27 #include "XMLTokenizer.h"
28 
29 #include "CDATASection.h"
30 #include "CString.h"
31 #include "CachedScript.h"
32 #include "Comment.h"
33 #include "DocLoader.h"
34 #include "Document.h"
35 #include "DocumentFragment.h"
36 #include "DocumentType.h"
37 #include "Frame.h"
38 #include "FrameLoader.h"
39 #include "FrameView.h"
40 #include "HTMLLinkElement.h"
41 #include "HTMLStyleElement.h"
42 #include "HTMLTokenizer.h" // for decodeNamedEntity
43 #include "ProcessingInstruction.h"
44 #include "ResourceError.h"
45 #include "ResourceHandle.h"
46 #include "ResourceRequest.h"
47 #include "ResourceResponse.h"
48 #include "ScriptController.h"
49 #include "ScriptElement.h"
50 #include "ScriptSourceCode.h"
51 #include "ScriptValue.h"
52 #include "TextResourceDecoder.h"
53 #include "XMLTokenizerScope.h"
54 #include <libxml/parser.h>
55 #include <libxml/parserInternals.h>
56 #include <wtf/Platform.h>
57 #include <wtf/StringExtras.h>
58 #include <wtf/Threading.h>
59 #include <wtf/UnusedParam.h>
60 #include <wtf/Vector.h>
61 
62 #if ENABLE(XSLT)
63 #include <libxslt/xslt.h>
64 #endif
65 
66 #if ENABLE(XHTMLMP)
67 #include "HTMLNames.h"
68 #include "HTMLScriptElement.h"
69 #endif
70 
71 using namespace std;
72 
73 namespace WebCore {
74 
75 class PendingCallbacks : public Noncopyable {
76 public:
~PendingCallbacks()77     ~PendingCallbacks()
78     {
79         deleteAllValues(m_callbacks);
80     }
81 
appendStartElementNSCallback(const xmlChar * xmlLocalName,const xmlChar * xmlPrefix,const xmlChar * xmlURI,int nb_namespaces,const xmlChar ** namespaces,int nb_attributes,int nb_defaulted,const xmlChar ** attributes)82     void appendStartElementNSCallback(const xmlChar* xmlLocalName, const xmlChar* xmlPrefix, const xmlChar* xmlURI, int nb_namespaces,
83                                       const xmlChar** namespaces, int nb_attributes, int nb_defaulted, const xmlChar** attributes)
84     {
85         PendingStartElementNSCallback* callback = new PendingStartElementNSCallback;
86 
87         callback->xmlLocalName = xmlStrdup(xmlLocalName);
88         callback->xmlPrefix = xmlStrdup(xmlPrefix);
89         callback->xmlURI = xmlStrdup(xmlURI);
90         callback->nb_namespaces = nb_namespaces;
91         callback->namespaces = static_cast<xmlChar**>(xmlMalloc(sizeof(xmlChar*) * nb_namespaces * 2));
92         for (int i = 0; i < nb_namespaces * 2 ; i++)
93             callback->namespaces[i] = xmlStrdup(namespaces[i]);
94         callback->nb_attributes = nb_attributes;
95         callback->nb_defaulted = nb_defaulted;
96         callback->attributes = static_cast<xmlChar**>(xmlMalloc(sizeof(xmlChar*) * nb_attributes * 5));
97         for (int i = 0; i < nb_attributes; i++) {
98             // Each attribute has 5 elements in the array:
99             // name, prefix, uri, value and an end pointer.
100 
101             for (int j = 0; j < 3; j++)
102                 callback->attributes[i * 5 + j] = xmlStrdup(attributes[i * 5 + j]);
103 
104             int len = attributes[i * 5 + 4] - attributes[i * 5 + 3];
105 
106             callback->attributes[i * 5 + 3] = xmlStrndup(attributes[i * 5 + 3], len);
107             callback->attributes[i * 5 + 4] = callback->attributes[i * 5 + 3] + len;
108         }
109 
110         m_callbacks.append(callback);
111     }
112 
appendEndElementNSCallback()113     void appendEndElementNSCallback()
114     {
115         PendingEndElementNSCallback* callback = new PendingEndElementNSCallback;
116 
117         m_callbacks.append(callback);
118     }
119 
appendCharactersCallback(const xmlChar * s,int len)120     void appendCharactersCallback(const xmlChar* s, int len)
121     {
122         PendingCharactersCallback* callback = new PendingCharactersCallback;
123 
124         callback->s = xmlStrndup(s, len);
125         callback->len = len;
126 
127         m_callbacks.append(callback);
128     }
129 
appendProcessingInstructionCallback(const xmlChar * target,const xmlChar * data)130     void appendProcessingInstructionCallback(const xmlChar* target, const xmlChar* data)
131     {
132         PendingProcessingInstructionCallback* callback = new PendingProcessingInstructionCallback;
133 
134         callback->target = xmlStrdup(target);
135         callback->data = xmlStrdup(data);
136 
137         m_callbacks.append(callback);
138     }
139 
appendCDATABlockCallback(const xmlChar * s,int len)140     void appendCDATABlockCallback(const xmlChar* s, int len)
141     {
142         PendingCDATABlockCallback* callback = new PendingCDATABlockCallback;
143 
144         callback->s = xmlStrndup(s, len);
145         callback->len = len;
146 
147         m_callbacks.append(callback);
148     }
149 
appendCommentCallback(const xmlChar * s)150     void appendCommentCallback(const xmlChar* s)
151     {
152         PendingCommentCallback* callback = new PendingCommentCallback;
153 
154         callback->s = xmlStrdup(s);
155 
156         m_callbacks.append(callback);
157     }
158 
appendInternalSubsetCallback(const xmlChar * name,const xmlChar * externalID,const xmlChar * systemID)159     void appendInternalSubsetCallback(const xmlChar* name, const xmlChar* externalID, const xmlChar* systemID)
160     {
161         PendingInternalSubsetCallback* callback = new PendingInternalSubsetCallback;
162 
163         callback->name = xmlStrdup(name);
164         callback->externalID = xmlStrdup(externalID);
165         callback->systemID = xmlStrdup(systemID);
166 
167         m_callbacks.append(callback);
168     }
169 
appendErrorCallback(XMLTokenizer::ErrorType type,const char * message,int lineNumber,int columnNumber)170     void appendErrorCallback(XMLTokenizer::ErrorType type, const char* message, int lineNumber, int columnNumber)
171     {
172         PendingErrorCallback* callback = new PendingErrorCallback;
173 
174         callback->message = strdup(message);
175         callback->type = type;
176         callback->lineNumber = lineNumber;
177         callback->columnNumber = columnNumber;
178 
179         m_callbacks.append(callback);
180     }
181 
callAndRemoveFirstCallback(XMLTokenizer * tokenizer)182     void callAndRemoveFirstCallback(XMLTokenizer* tokenizer)
183     {
184         OwnPtr<PendingCallback> callback(m_callbacks.first());
185         m_callbacks.removeFirst();
186         callback->call(tokenizer);
187     }
188 
isEmpty() const189     bool isEmpty() const { return m_callbacks.isEmpty(); }
190 
191 private:
192     struct PendingCallback {
~PendingCallbackWebCore::PendingCallbacks::PendingCallback193         virtual ~PendingCallback() { }
194         virtual void call(XMLTokenizer* tokenizer) = 0;
195     };
196 
197     struct PendingStartElementNSCallback : public PendingCallback {
~PendingStartElementNSCallbackWebCore::PendingCallbacks::PendingStartElementNSCallback198         virtual ~PendingStartElementNSCallback()
199         {
200             xmlFree(xmlLocalName);
201             xmlFree(xmlPrefix);
202             xmlFree(xmlURI);
203             for (int i = 0; i < nb_namespaces * 2; i++)
204                 xmlFree(namespaces[i]);
205             xmlFree(namespaces);
206             for (int i = 0; i < nb_attributes; i++)
207                 for (int j = 0; j < 4; j++)
208                     xmlFree(attributes[i * 5 + j]);
209             xmlFree(attributes);
210         }
211 
callWebCore::PendingCallbacks::PendingStartElementNSCallback212         virtual void call(XMLTokenizer* tokenizer)
213         {
214             tokenizer->startElementNs(xmlLocalName, xmlPrefix, xmlURI,
215                                       nb_namespaces, const_cast<const xmlChar**>(namespaces),
216                                       nb_attributes, nb_defaulted, const_cast<const xmlChar**>(attributes));
217         }
218 
219         xmlChar* xmlLocalName;
220         xmlChar* xmlPrefix;
221         xmlChar* xmlURI;
222         int nb_namespaces;
223         xmlChar** namespaces;
224         int nb_attributes;
225         int nb_defaulted;
226         xmlChar** attributes;
227     };
228 
229     struct PendingEndElementNSCallback : public PendingCallback {
callWebCore::PendingCallbacks::PendingEndElementNSCallback230         virtual void call(XMLTokenizer* tokenizer)
231         {
232             tokenizer->endElementNs();
233         }
234     };
235 
236     struct PendingCharactersCallback : public PendingCallback {
~PendingCharactersCallbackWebCore::PendingCallbacks::PendingCharactersCallback237         virtual ~PendingCharactersCallback()
238         {
239             xmlFree(s);
240         }
241 
callWebCore::PendingCallbacks::PendingCharactersCallback242         virtual void call(XMLTokenizer* tokenizer)
243         {
244             tokenizer->characters(s, len);
245         }
246 
247         xmlChar* s;
248         int len;
249     };
250 
251     struct PendingProcessingInstructionCallback : public PendingCallback {
~PendingProcessingInstructionCallbackWebCore::PendingCallbacks::PendingProcessingInstructionCallback252         virtual ~PendingProcessingInstructionCallback()
253         {
254             xmlFree(target);
255             xmlFree(data);
256         }
257 
callWebCore::PendingCallbacks::PendingProcessingInstructionCallback258         virtual void call(XMLTokenizer* tokenizer)
259         {
260             tokenizer->processingInstruction(target, data);
261         }
262 
263         xmlChar* target;
264         xmlChar* data;
265     };
266 
267     struct PendingCDATABlockCallback : public PendingCallback {
~PendingCDATABlockCallbackWebCore::PendingCallbacks::PendingCDATABlockCallback268         virtual ~PendingCDATABlockCallback()
269         {
270             xmlFree(s);
271         }
272 
callWebCore::PendingCallbacks::PendingCDATABlockCallback273         virtual void call(XMLTokenizer* tokenizer)
274         {
275             tokenizer->cdataBlock(s, len);
276         }
277 
278         xmlChar* s;
279         int len;
280     };
281 
282     struct PendingCommentCallback : public PendingCallback {
~PendingCommentCallbackWebCore::PendingCallbacks::PendingCommentCallback283         virtual ~PendingCommentCallback()
284         {
285             xmlFree(s);
286         }
287 
callWebCore::PendingCallbacks::PendingCommentCallback288         virtual void call(XMLTokenizer* tokenizer)
289         {
290             tokenizer->comment(s);
291         }
292 
293         xmlChar* s;
294     };
295 
296     struct PendingInternalSubsetCallback : public PendingCallback {
~PendingInternalSubsetCallbackWebCore::PendingCallbacks::PendingInternalSubsetCallback297         virtual ~PendingInternalSubsetCallback()
298         {
299             xmlFree(name);
300             xmlFree(externalID);
301             xmlFree(systemID);
302         }
303 
callWebCore::PendingCallbacks::PendingInternalSubsetCallback304         virtual void call(XMLTokenizer* tokenizer)
305         {
306             tokenizer->internalSubset(name, externalID, systemID);
307         }
308 
309         xmlChar* name;
310         xmlChar* externalID;
311         xmlChar* systemID;
312     };
313 
314     struct PendingErrorCallback: public PendingCallback {
~PendingErrorCallbackWebCore::PendingCallbacks::PendingErrorCallback315         virtual ~PendingErrorCallback()
316         {
317             free(message);
318         }
319 
callWebCore::PendingCallbacks::PendingErrorCallback320         virtual void call(XMLTokenizer* tokenizer)
321         {
322             tokenizer->handleError(type, message, lineNumber, columnNumber);
323         }
324 
325         XMLTokenizer::ErrorType type;
326         char* message;
327         int lineNumber;
328         int columnNumber;
329     };
330 
331     Deque<PendingCallback*> m_callbacks;
332 };
333 // --------------------------------
334 
335 static int globalDescriptor = 0;
336 static ThreadIdentifier libxmlLoaderThread = 0;
337 
matchFunc(const char *)338 static int matchFunc(const char*)
339 {
340     // Only match loads initiated due to uses of libxml2 from within XMLTokenizer to avoid
341     // interfering with client applications that also use libxml2.  http://bugs.webkit.org/show_bug.cgi?id=17353
342     return XMLTokenizerScope::currentDocLoader && currentThread() == libxmlLoaderThread;
343 }
344 
345 class OffsetBuffer {
346 public:
OffsetBuffer(const Vector<char> & b)347     OffsetBuffer(const Vector<char>& b) : m_buffer(b), m_currentOffset(0) { }
348 
readOutBytes(char * outputBuffer,unsigned askedToRead)349     int readOutBytes(char* outputBuffer, unsigned askedToRead)
350     {
351         unsigned bytesLeft = m_buffer.size() - m_currentOffset;
352         unsigned lenToCopy = min(askedToRead, bytesLeft);
353         if (lenToCopy) {
354             memcpy(outputBuffer, m_buffer.data() + m_currentOffset, lenToCopy);
355             m_currentOffset += lenToCopy;
356         }
357         return lenToCopy;
358     }
359 
360 private:
361     Vector<char> m_buffer;
362     unsigned m_currentOffset;
363 };
364 
shouldAllowExternalLoad(const KURL & url)365 static bool shouldAllowExternalLoad(const KURL& url)
366 {
367     String urlString = url.string();
368 
369     // On non-Windows platforms libxml asks for this URL, the
370     // "XML_XML_DEFAULT_CATALOG", on initialization.
371     if (urlString == "file:///etc/xml/catalog")
372         return false;
373 
374     // On Windows, libxml computes a URL relative to where its DLL resides.
375     if (urlString.startsWith("file:///", false) && urlString.endsWith("/etc/catalog", false))
376         return false;
377 
378     // The most common DTD.  There isn't much point in hammering www.w3c.org
379     // by requesting this URL for every XHTML document.
380     if (urlString.startsWith("http://www.w3.org/TR/xhtml", false))
381         return false;
382 
383     // Similarly, there isn't much point in requesting the SVG DTD.
384     if (urlString.startsWith("http://www.w3.org/Graphics/SVG", false))
385         return false;
386 
387     // The libxml doesn't give us a lot of context for deciding whether to
388     // allow this request.  In the worst case, this load could be for an
389     // external entity and the resulting document could simply read the
390     // retrieved content.  If we had more context, we could potentially allow
391     // the parser to load a DTD.  As things stand, we take the conservative
392     // route and allow same-origin requests only.
393     if (!XMLTokenizerScope::currentDocLoader->doc()->securityOrigin()->canRequest(url)) {
394         XMLTokenizerScope::currentDocLoader->printAccessDeniedMessage(url);
395         return false;
396     }
397 
398     return true;
399 }
400 
openFunc(const char * uri)401 static void* openFunc(const char* uri)
402 {
403     ASSERT(XMLTokenizerScope::currentDocLoader);
404     ASSERT(currentThread() == libxmlLoaderThread);
405 
406     KURL url(KURL(), uri);
407 
408     if (!shouldAllowExternalLoad(url))
409         return &globalDescriptor;
410 
411     ResourceError error;
412     ResourceResponse response;
413     Vector<char> data;
414 
415 
416     {
417         DocLoader* docLoader = XMLTokenizerScope::currentDocLoader;
418         XMLTokenizerScope scope(0);
419         // FIXME: We should restore the original global error handler as well.
420 
421         if (docLoader->frame())
422             docLoader->frame()->loader()->loadResourceSynchronously(url, AllowStoredCredentials, error, response, data);
423     }
424 
425     // We have to check the URL again after the load to catch redirects.
426     // See <https://bugs.webkit.org/show_bug.cgi?id=21963>.
427     if (!shouldAllowExternalLoad(response.url()))
428         return &globalDescriptor;
429 
430     return new OffsetBuffer(data);
431 }
432 
readFunc(void * context,char * buffer,int len)433 static int readFunc(void* context, char* buffer, int len)
434 {
435     // Do 0-byte reads in case of a null descriptor
436     if (context == &globalDescriptor)
437         return 0;
438 
439     OffsetBuffer* data = static_cast<OffsetBuffer*>(context);
440     return data->readOutBytes(buffer, len);
441 }
442 
// libxml2 I/O "write" callback. Output is not supported: every call
// reports that zero bytes were written and ignores its arguments.
static int writeFunc(void*, const char*, int)
{
    return 0;
}
448 
closeFunc(void * context)449 static int closeFunc(void* context)
450 {
451     if (context != &globalDescriptor) {
452         OffsetBuffer* data = static_cast<OffsetBuffer*>(context);
453         delete data;
454     }
455     return 0;
456 }
457 
458 #if ENABLE(XSLT)
errorFunc(void *,const char *,...)459 static void errorFunc(void*, const char*, ...)
460 {
461     // FIXME: It would be nice to display error messages somewhere.
462 }
463 #endif
464 
465 static bool didInit = false;
466 
createStringParser(xmlSAXHandlerPtr handlers,void * userData)467 static xmlParserCtxtPtr createStringParser(xmlSAXHandlerPtr handlers, void* userData)
468 {
469     if (!didInit) {
470         xmlInitParser();
471         xmlRegisterInputCallbacks(matchFunc, openFunc, readFunc, closeFunc);
472         xmlRegisterOutputCallbacks(matchFunc, openFunc, writeFunc, closeFunc);
473         libxmlLoaderThread = currentThread();
474         didInit = true;
475     }
476 
477     xmlParserCtxtPtr parser = xmlCreatePushParserCtxt(handlers, 0, 0, 0, 0);
478     parser->_private = userData;
479     parser->replaceEntities = true;
480     const UChar BOM = 0xFEFF;
481     const unsigned char BOMHighByte = *reinterpret_cast<const unsigned char*>(&BOM);
482     xmlSwitchEncoding(parser, BOMHighByte == 0xFF ? XML_CHAR_ENCODING_UTF16LE : XML_CHAR_ENCODING_UTF16BE);
483 
484     return parser;
485 }
486 
487 
488 // Chunk should be encoded in UTF-8
createMemoryParser(xmlSAXHandlerPtr handlers,void * userData,const char * chunk)489 static xmlParserCtxtPtr createMemoryParser(xmlSAXHandlerPtr handlers, void* userData, const char* chunk)
490 {
491     if (!didInit) {
492         xmlInitParser();
493         xmlRegisterInputCallbacks(matchFunc, openFunc, readFunc, closeFunc);
494         xmlRegisterOutputCallbacks(matchFunc, openFunc, writeFunc, closeFunc);
495         libxmlLoaderThread = currentThread();
496         didInit = true;
497     }
498 
499     xmlParserCtxtPtr parser = xmlCreateMemoryParserCtxt(chunk, xmlStrlen((const xmlChar*)chunk));
500 
501     if (!parser)
502         return 0;
503 
504     // Copy the sax handler
505     memcpy(parser->sax, handlers, sizeof(xmlSAXHandler));
506 
507     // Set parser options.
508     // XML_PARSE_NODICT: default dictionary option.
509     // XML_PARSE_NOENT: force entities substitutions.
510     xmlCtxtUseOptions(parser, XML_PARSE_NODICT | XML_PARSE_NOENT);
511 
512     // Internal initialization
513     parser->sax2 = 1;
514     parser->instate = XML_PARSER_CONTENT; // We are parsing a CONTENT
515     parser->depth = 0;
516     parser->str_xml = xmlDictLookup(parser->dict, BAD_CAST "xml", 3);
517     parser->str_xmlns = xmlDictLookup(parser->dict, BAD_CAST "xmlns", 5);
518     parser->str_xml_ns = xmlDictLookup(parser->dict, XML_XML_NAMESPACE, 36);
519     parser->_private = userData;
520 
521     return parser;
522 }
523 
524 // --------------------------------
525 
XMLTokenizer(Document * _doc,FrameView * _view)526 XMLTokenizer::XMLTokenizer(Document* _doc, FrameView* _view)
527     : m_doc(_doc)
528     , m_view(_view)
529     , m_context(0)
530     , m_pendingCallbacks(new PendingCallbacks)
531     , m_currentNode(_doc)
532     , m_currentNodeIsReferenced(false)
533     , m_sawError(false)
534     , m_sawXSLTransform(false)
535     , m_sawFirstElement(false)
536     , m_isXHTMLDocument(false)
537 #if ENABLE(XHTMLMP)
538     , m_isXHTMLMPDocument(false)
539     , m_hasDocTypeDeclaration(false)
540 #endif
541     , m_parserPaused(false)
542     , m_requestingScript(false)
543     , m_finishCalled(false)
544     , m_errorCount(0)
545     , m_lastErrorLine(0)
546     , m_lastErrorColumn(0)
547     , m_pendingScript(0)
548     , m_scriptStartLine(0)
549     , m_parsingFragment(false)
550 {
551 }
552 
XMLTokenizer(DocumentFragment * fragment,Element * parentElement)553 XMLTokenizer::XMLTokenizer(DocumentFragment* fragment, Element* parentElement)
554     : m_doc(fragment->document())
555     , m_view(0)
556     , m_context(0)
557     , m_pendingCallbacks(new PendingCallbacks)
558     , m_currentNode(fragment)
559     , m_currentNodeIsReferenced(fragment)
560     , m_sawError(false)
561     , m_sawXSLTransform(false)
562     , m_sawFirstElement(false)
563     , m_isXHTMLDocument(false)
564 #if ENABLE(XHTMLMP)
565     , m_isXHTMLMPDocument(false)
566     , m_hasDocTypeDeclaration(false)
567 #endif
568     , m_parserPaused(false)
569     , m_requestingScript(false)
570     , m_finishCalled(false)
571     , m_errorCount(0)
572     , m_lastErrorLine(0)
573     , m_lastErrorColumn(0)
574     , m_pendingScript(0)
575     , m_scriptStartLine(0)
576     , m_parsingFragment(true)
577 {
578     if (fragment)
579         fragment->ref();
580     if (m_doc)
581         m_doc->ref();
582 
583     // Add namespaces based on the parent node
584     Vector<Element*> elemStack;
585     while (parentElement) {
586         elemStack.append(parentElement);
587 
588         Node* n = parentElement->parentNode();
589         if (!n || !n->isElementNode())
590             break;
591         parentElement = static_cast<Element*>(n);
592     }
593 
594     if (elemStack.isEmpty())
595         return;
596 
597     for (Element* element = elemStack.last(); !elemStack.isEmpty(); elemStack.removeLast()) {
598         if (NamedNodeMap* attrs = element->attributes()) {
599             for (unsigned i = 0; i < attrs->length(); i++) {
600                 Attribute* attr = attrs->attributeItem(i);
601                 if (attr->localName() == "xmlns")
602                     m_defaultNamespaceURI = attr->value();
603                 else if (attr->prefix() == "xmlns")
604                     m_prefixToNamespaceMap.set(attr->localName(), attr->value());
605             }
606         }
607     }
608 
609     // If the parent element is not in document tree, there may be no xmlns attribute; just default to the parent's namespace.
610     if (m_defaultNamespaceURI.isNull() && !parentElement->inDocument())
611         m_defaultNamespaceURI = parentElement->namespaceURI();
612 }
613 
~XMLTokenizer()614 XMLTokenizer::~XMLTokenizer()
615 {
616     setCurrentNode(0);
617     if (m_parsingFragment && m_doc)
618         m_doc->deref();
619     if (m_pendingScript)
620         m_pendingScript->removeClient(this);
621     if (m_context)
622         xmlFreeParserCtxt(m_context);
623 }
624 
doWrite(const String & parseString)625 void XMLTokenizer::doWrite(const String& parseString)
626 {
627     if (!m_context)
628         initializeParserContext();
629 
630     // libXML throws an error if you try to switch the encoding for an empty string.
631     if (parseString.length()) {
632         // Hack around libxml2's lack of encoding overide support by manually
633         // resetting the encoding to UTF-16 before every chunk.  Otherwise libxml
634         // will detect <?xml version="1.0" encoding="<encoding name>"?> blocks
635         // and switch encodings, causing the parse to fail.
636         const UChar BOM = 0xFEFF;
637         const unsigned char BOMHighByte = *reinterpret_cast<const unsigned char*>(&BOM);
638         xmlSwitchEncoding(m_context, BOMHighByte == 0xFF ? XML_CHAR_ENCODING_UTF16LE : XML_CHAR_ENCODING_UTF16BE);
639 
640         XMLTokenizerScope scope(m_doc->docLoader());
641         xmlParseChunk(m_context, reinterpret_cast<const char*>(parseString.characters()), sizeof(UChar) * parseString.length(), 0);
642     }
643 
644     if (m_doc->decoder() && m_doc->decoder()->sawError()) {
645         // If the decoder saw an error, report it as fatal (stops parsing)
646         handleError(fatal, "Encoding error", lineNumber(), columnNumber());
647     }
648 
649     return;
650 }
651 
toString(const xmlChar * str,unsigned len)652 static inline String toString(const xmlChar* str, unsigned len)
653 {
654     return UTF8Encoding().decode(reinterpret_cast<const char*>(str), len);
655 }
656 
toString(const xmlChar * str)657 static inline String toString(const xmlChar* str)
658 {
659     if (!str)
660         return String();
661 
662     return UTF8Encoding().decode(reinterpret_cast<const char*>(str), strlen(reinterpret_cast<const char*>(str)));
663 }
664 
665 struct _xmlSAX2Namespace {
666     const xmlChar* prefix;
667     const xmlChar* uri;
668 };
669 typedef struct _xmlSAX2Namespace xmlSAX2Namespace;
670 
handleElementNamespaces(Element * newElement,const xmlChar ** libxmlNamespaces,int nb_namespaces,ExceptionCode & ec)671 static inline void handleElementNamespaces(Element* newElement, const xmlChar** libxmlNamespaces, int nb_namespaces, ExceptionCode& ec)
672 {
673     xmlSAX2Namespace* namespaces = reinterpret_cast<xmlSAX2Namespace*>(libxmlNamespaces);
674     for (int i = 0; i < nb_namespaces; i++) {
675         String namespaceQName = "xmlns";
676         String namespaceURI = toString(namespaces[i].uri);
677         if (namespaces[i].prefix)
678             namespaceQName = "xmlns:" + toString(namespaces[i].prefix);
679         newElement->setAttributeNS("http://www.w3.org/2000/xmlns/", namespaceQName, namespaceURI, ec);
680         if (ec) // exception setting attributes
681             return;
682     }
683 }
684 
685 struct _xmlSAX2Attributes {
686     const xmlChar* localname;
687     const xmlChar* prefix;
688     const xmlChar* uri;
689     const xmlChar* value;
690     const xmlChar* end;
691 };
692 typedef struct _xmlSAX2Attributes xmlSAX2Attributes;
693 
handleElementAttributes(Element * newElement,const xmlChar ** libxmlAttributes,int nb_attributes,ExceptionCode & ec)694 static inline void handleElementAttributes(Element* newElement, const xmlChar** libxmlAttributes, int nb_attributes, ExceptionCode& ec)
695 {
696     xmlSAX2Attributes* attributes = reinterpret_cast<xmlSAX2Attributes*>(libxmlAttributes);
697     for (int i = 0; i < nb_attributes; i++) {
698         String attrLocalName = toString(attributes[i].localname);
699         int valueLength = (int) (attributes[i].end - attributes[i].value);
700         String attrValue = toString(attributes[i].value, valueLength);
701         String attrPrefix = toString(attributes[i].prefix);
702         String attrURI = attrPrefix.isEmpty() ? String() : toString(attributes[i].uri);
703         String attrQName = attrPrefix.isEmpty() ? attrLocalName : attrPrefix + ":" + attrLocalName;
704 
705         newElement->setAttributeNS(attrURI, attrQName, attrValue, ec);
706         if (ec) // exception setting attributes
707             return;
708     }
709 }
710 
startElementNs(const xmlChar * xmlLocalName,const xmlChar * xmlPrefix,const xmlChar * xmlURI,int nb_namespaces,const xmlChar ** libxmlNamespaces,int nb_attributes,int nb_defaulted,const xmlChar ** libxmlAttributes)711 void XMLTokenizer::startElementNs(const xmlChar* xmlLocalName, const xmlChar* xmlPrefix, const xmlChar* xmlURI, int nb_namespaces,
712                                   const xmlChar** libxmlNamespaces, int nb_attributes, int nb_defaulted, const xmlChar** libxmlAttributes)
713 {
714     if (m_parserStopped)
715         return;
716 
717     if (m_parserPaused) {
718         m_pendingCallbacks->appendStartElementNSCallback(xmlLocalName, xmlPrefix, xmlURI, nb_namespaces, libxmlNamespaces,
719                                                          nb_attributes, nb_defaulted, libxmlAttributes);
720         return;
721     }
722 
723 #if ENABLE(XHTMLMP)
724     // check if the DOCTYPE Declaration of XHTMLMP document exists
725     if (!m_hasDocTypeDeclaration && m_doc->isXHTMLMPDocument()) {
726         handleError(fatal, "DOCTYPE declaration lost.", lineNumber(), columnNumber());
727         return;
728     }
729 #endif
730 
731     exitText();
732 
733     String localName = toString(xmlLocalName);
734     String uri = toString(xmlURI);
735     String prefix = toString(xmlPrefix);
736 
737     if (m_parsingFragment && uri.isNull()) {
738         if (!prefix.isNull())
739             uri = m_prefixToNamespaceMap.get(prefix);
740         else
741             uri = m_defaultNamespaceURI;
742     }
743 
744 #if ENABLE(XHTMLMP)
745     if (!m_sawFirstElement && isXHTMLMPDocument()) {
746         // As per the section 7.1 of OMA-WAP-XHTMLMP-V1_1-20061020-A.pdf,
747         // we should make sure that the root element MUST be 'html' and
748         // ensure the name of the default namespace on the root elment 'html'
749         // MUST be 'http://www.w3.org/1999/xhtml'
750         if (localName != HTMLNames::htmlTag.localName()) {
751             handleError(fatal, "XHTMLMP document expects 'html' as root element.", lineNumber(), columnNumber());
752             return;
753         }
754 
755         if (uri.isNull()) {
756             m_defaultNamespaceURI = HTMLNames::xhtmlNamespaceURI;
757             uri = m_defaultNamespaceURI;
758         }
759     }
760 #endif
761 
762     bool isFirstElement = !m_sawFirstElement;
763     m_sawFirstElement = true;
764 
765     QualifiedName qName(prefix, localName, uri);
766     RefPtr<Element> newElement = m_doc->createElement(qName, true);
767     if (!newElement) {
768         stopParsing();
769         return;
770     }
771 
772     ExceptionCode ec = 0;
773     handleElementNamespaces(newElement.get(), libxmlNamespaces, nb_namespaces, ec);
774     if (ec) {
775         stopParsing();
776         return;
777     }
778 
779     ScriptController* jsProxy = m_doc->frame() ? m_doc->frame()->script() : 0;
780     if (jsProxy && m_doc->frame()->script()->isEnabled())
781         jsProxy->setEventHandlerLineNumber(lineNumber());
782 
783     handleElementAttributes(newElement.get(), libxmlAttributes, nb_attributes, ec);
784     if (ec) {
785         stopParsing();
786         return;
787     }
788 
789     if (jsProxy)
790         jsProxy->setEventHandlerLineNumber(0);
791 
792     newElement->beginParsingChildren();
793 
794     ScriptElement* scriptElement = toScriptElement(newElement.get());
795     if (scriptElement)
796         m_scriptStartLine = lineNumber();
797 
798     if (!m_currentNode->addChild(newElement.get())) {
799         stopParsing();
800         return;
801     }
802 
803     setCurrentNode(newElement.get());
804     if (m_view && !newElement->attached())
805         newElement->attach();
806 
807     if (isFirstElement && m_doc->frame())
808         m_doc->frame()->loader()->dispatchDocumentElementAvailable();
809 }
810 
endElementNs()811 void XMLTokenizer::endElementNs()
812 {
813     if (m_parserStopped)
814         return;
815 
816     if (m_parserPaused) {
817         m_pendingCallbacks->appendEndElementNSCallback();
818         return;
819     }
820 
821     exitText();
822 
823     Node* n = m_currentNode;
824     RefPtr<Node> parent = n->parentNode();
825     n->finishParsingChildren();
826 
827     if (!n->isElementNode() || !m_view) {
828         setCurrentNode(parent.get());
829         return;
830     }
831 
832     Element* element = static_cast<Element*>(n);
833     ScriptElement* scriptElement = toScriptElement(element);
834     if (!scriptElement) {
835         setCurrentNode(parent.get());
836         return;
837     }
838 
839     // don't load external scripts for standalone documents (for now)
840     ASSERT(!m_pendingScript);
841     m_requestingScript = true;
842 
843 #if ENABLE(XHTMLMP)
844     if (!scriptElement->shouldExecuteAsJavaScript())
845         m_doc->setShouldProcessNoscriptElement(true);
846     else
847 #endif
848     {
849         String scriptHref = scriptElement->sourceAttributeValue();
850         if (!scriptHref.isEmpty()) {
851             // we have a src attribute
852             String scriptCharset = scriptElement->scriptCharset();
853             if ((m_pendingScript = m_doc->docLoader()->requestScript(scriptHref, scriptCharset))) {
854                 m_scriptElement = element;
855                 m_pendingScript->addClient(this);
856 
857                 // m_pendingScript will be 0 if script was already loaded and ref() executed it
858                 if (m_pendingScript)
859                     pauseParsing();
860             } else
861                 m_scriptElement = 0;
862         } else
863             m_view->frame()->loader()->executeScript(ScriptSourceCode(scriptElement->scriptContent(), m_doc->url(), m_scriptStartLine));
864     }
865     m_requestingScript = false;
866     setCurrentNode(parent.get());
867 }
868 
characters(const xmlChar * s,int len)869 void XMLTokenizer::characters(const xmlChar* s, int len)
870 {
871     if (m_parserStopped)
872         return;
873 
874     if (m_parserPaused) {
875         m_pendingCallbacks->appendCharactersCallback(s, len);
876         return;
877     }
878 
879     if (m_currentNode->isTextNode() || enterText())
880         m_bufferedText.append(s, len);
881 }
882 
error(ErrorType type,const char * message,va_list args)883 void XMLTokenizer::error(ErrorType type, const char* message, va_list args)
884 {
885     if (m_parserStopped)
886         return;
887 
888 #if PLATFORM(WIN_OS)
889     char m[1024];
890     vsnprintf(m, sizeof(m) - 1, message, args);
891 #else
892     char* m;
893     if (vasprintf(&m, message, args) == -1)
894         return;
895 #endif
896 
897     if (m_parserPaused)
898         m_pendingCallbacks->appendErrorCallback(type, m, lineNumber(), columnNumber());
899     else
900         handleError(type, m, lineNumber(), columnNumber());
901 
902 #if !PLATFORM(WIN_OS)
903     free(m);
904 #endif
905 }
906 
processingInstruction(const xmlChar * target,const xmlChar * data)907 void XMLTokenizer::processingInstruction(const xmlChar* target, const xmlChar* data)
908 {
909     if (m_parserStopped)
910         return;
911 
912     if (m_parserPaused) {
913         m_pendingCallbacks->appendProcessingInstructionCallback(target, data);
914         return;
915     }
916 
917     exitText();
918 
919     // ### handle exceptions
920     int exception = 0;
921     RefPtr<ProcessingInstruction> pi = m_doc->createProcessingInstruction(
922         toString(target), toString(data), exception);
923     if (exception)
924         return;
925 
926     pi->setCreatedByParser(true);
927 
928     if (!m_currentNode->addChild(pi.get()))
929         return;
930     if (m_view && !pi->attached())
931         pi->attach();
932 
933     pi->finishParsingChildren();
934 
935 #if ENABLE(XSLT)
936     m_sawXSLTransform = !m_sawFirstElement && pi->isXSL();
937     if (m_sawXSLTransform && !m_doc->transformSourceDocument())
938         stopParsing();
939 #endif
940 }
941 
cdataBlock(const xmlChar * s,int len)942 void XMLTokenizer::cdataBlock(const xmlChar* s, int len)
943 {
944     if (m_parserStopped)
945         return;
946 
947     if (m_parserPaused) {
948         m_pendingCallbacks->appendCDATABlockCallback(s, len);
949         return;
950     }
951 
952     exitText();
953 
954     RefPtr<Node> newNode = new CDATASection(m_doc, toString(s, len));
955     if (!m_currentNode->addChild(newNode.get()))
956         return;
957     if (m_view && !newNode->attached())
958         newNode->attach();
959 }
960 
comment(const xmlChar * s)961 void XMLTokenizer::comment(const xmlChar* s)
962 {
963     if (m_parserStopped)
964         return;
965 
966     if (m_parserPaused) {
967         m_pendingCallbacks->appendCommentCallback(s);
968         return;
969     }
970 
971     exitText();
972 
973     RefPtr<Node> newNode = new Comment(m_doc, toString(s));
974     m_currentNode->addChild(newNode.get());
975     if (m_view && !newNode->attached())
976         newNode->attach();
977 }
978 
startDocument(const xmlChar * version,const xmlChar * encoding,int standalone)979 void XMLTokenizer::startDocument(const xmlChar* version, const xmlChar* encoding, int standalone)
980 {
981     ExceptionCode ec = 0;
982 
983     if (version)
984         m_doc->setXMLVersion(toString(version), ec);
985     m_doc->setXMLStandalone(standalone == 1, ec); // possible values are 0, 1, and -1
986     if (encoding)
987         m_doc->setXMLEncoding(toString(encoding));
988 }
989 
endDocument()990 void XMLTokenizer::endDocument()
991 {
992     exitText();
993 #if ENABLE(XHTMLMP)
994     m_hasDocTypeDeclaration = false;
995 #endif
996 }
997 
internalSubset(const xmlChar * name,const xmlChar * externalID,const xmlChar * systemID)998 void XMLTokenizer::internalSubset(const xmlChar* name, const xmlChar* externalID, const xmlChar* systemID)
999 {
1000     if (m_parserStopped)
1001         return;
1002 
1003     if (m_parserPaused) {
1004         m_pendingCallbacks->appendInternalSubsetCallback(name, externalID, systemID);
1005         return;
1006     }
1007 
1008     if (m_doc) {
1009 #if ENABLE(WML) || ENABLE(XHTMLMP)
1010         String extId = toString(externalID);
1011 #endif
1012 #if ENABLE(WML)
1013         if (isWMLDocument()
1014             && extId != "-//WAPFORUM//DTD WML 1.3//EN"
1015             && extId != "-//WAPFORUM//DTD WML 1.2//EN"
1016             && extId != "-//WAPFORUM//DTD WML 1.1//EN"
1017             && extId != "-//WAPFORUM//DTD WML 1.0//EN")
1018             handleError(fatal, "Invalid DTD Public ID", lineNumber(), columnNumber());
1019 #endif
1020 #if ENABLE(XHTMLMP)
1021         String dtdName = toString(name);
1022         if (extId == "-//WAPFORUM//DTD XHTML Mobile 1.0//EN"
1023             || extId == "-//WAPFORUM//DTD XHTML Mobile 1.1//EN") {
1024             if (dtdName != HTMLNames::htmlTag.localName()) {
1025                 handleError(fatal, "Invalid DOCTYPE declaration, expected 'html' as root element.", lineNumber(), columnNumber());
1026                 return;
1027             }
1028 
1029             if (m_doc->isXHTMLMPDocument())
1030                 setIsXHTMLMPDocument(true);
1031             else
1032                 setIsXHTMLDocument(true);
1033 
1034             m_hasDocTypeDeclaration = true;
1035         }
1036 #endif
1037 
1038 #if ENABLE(XHTMLMP)
1039         m_doc->addChild(DocumentType::create(m_doc, dtdName, extId, toString(systemID)));
1040 #elif ENABLE(WML)
1041         m_doc->addChild(DocumentType::create(m_doc, toString(name), extId, toString(systemID)));
1042 #else
1043         m_doc->addChild(DocumentType::create(m_doc, toString(name), toString(externalID), toString(systemID)));
1044 #endif
1045     }
1046 }
1047 
getTokenizer(void * closure)1048 static inline XMLTokenizer* getTokenizer(void* closure)
1049 {
1050     xmlParserCtxtPtr ctxt = static_cast<xmlParserCtxtPtr>(closure);
1051     return static_cast<XMLTokenizer*>(ctxt->_private);
1052 }
1053 
1054 // This is a hack around http://bugzilla.gnome.org/show_bug.cgi?id=159219
1055 // Otherwise libxml seems to call all the SAX callbacks twice for any replaced entity.
hackAroundLibXMLEntityBug(void * closure)1056 static inline bool hackAroundLibXMLEntityBug(void* closure)
1057 {
1058 #if LIBXML_VERSION >= 20627
1059     UNUSED_PARAM(closure);
1060 
1061     // This bug has been fixed in libxml 2.6.27.
1062     return false;
1063 #else
1064     return static_cast<xmlParserCtxtPtr>(closure)->node;
1065 #endif
1066 }
1067 
startElementNsHandler(void * closure,const xmlChar * localname,const xmlChar * prefix,const xmlChar * uri,int nb_namespaces,const xmlChar ** namespaces,int nb_attributes,int nb_defaulted,const xmlChar ** libxmlAttributes)1068 static void startElementNsHandler(void* closure, const xmlChar* localname, const xmlChar* prefix, const xmlChar* uri, int nb_namespaces, const xmlChar** namespaces, int nb_attributes, int nb_defaulted, const xmlChar** libxmlAttributes)
1069 {
1070     if (hackAroundLibXMLEntityBug(closure))
1071         return;
1072 
1073     getTokenizer(closure)->startElementNs(localname, prefix, uri, nb_namespaces, namespaces, nb_attributes, nb_defaulted, libxmlAttributes);
1074 }
1075 
endElementNsHandler(void * closure,const xmlChar *,const xmlChar *,const xmlChar *)1076 static void endElementNsHandler(void* closure, const xmlChar*, const xmlChar*, const xmlChar*)
1077 {
1078     if (hackAroundLibXMLEntityBug(closure))
1079         return;
1080 
1081     getTokenizer(closure)->endElementNs();
1082 }
1083 
charactersHandler(void * closure,const xmlChar * s,int len)1084 static void charactersHandler(void* closure, const xmlChar* s, int len)
1085 {
1086     if (hackAroundLibXMLEntityBug(closure))
1087         return;
1088 
1089     getTokenizer(closure)->characters(s, len);
1090 }
1091 
processingInstructionHandler(void * closure,const xmlChar * target,const xmlChar * data)1092 static void processingInstructionHandler(void* closure, const xmlChar* target, const xmlChar* data)
1093 {
1094     if (hackAroundLibXMLEntityBug(closure))
1095         return;
1096 
1097     getTokenizer(closure)->processingInstruction(target, data);
1098 }
1099 
cdataBlockHandler(void * closure,const xmlChar * s,int len)1100 static void cdataBlockHandler(void* closure, const xmlChar* s, int len)
1101 {
1102     if (hackAroundLibXMLEntityBug(closure))
1103         return;
1104 
1105     getTokenizer(closure)->cdataBlock(s, len);
1106 }
1107 
commentHandler(void * closure,const xmlChar * comment)1108 static void commentHandler(void* closure, const xmlChar* comment)
1109 {
1110     if (hackAroundLibXMLEntityBug(closure))
1111         return;
1112 
1113     getTokenizer(closure)->comment(comment);
1114 }
1115 
1116 WTF_ATTRIBUTE_PRINTF(2, 3)
warningHandler(void * closure,const char * message,...)1117 static void warningHandler(void* closure, const char* message, ...)
1118 {
1119     va_list args;
1120     va_start(args, message);
1121     getTokenizer(closure)->error(XMLTokenizer::warning, message, args);
1122     va_end(args);
1123 }
1124 
1125 WTF_ATTRIBUTE_PRINTF(2, 3)
fatalErrorHandler(void * closure,const char * message,...)1126 static void fatalErrorHandler(void* closure, const char* message, ...)
1127 {
1128     va_list args;
1129     va_start(args, message);
1130     getTokenizer(closure)->error(XMLTokenizer::fatal, message, args);
1131     va_end(args);
1132 }
1133 
1134 WTF_ATTRIBUTE_PRINTF(2, 3)
normalErrorHandler(void * closure,const char * message,...)1135 static void normalErrorHandler(void* closure, const char* message, ...)
1136 {
1137     va_list args;
1138     va_start(args, message);
1139     getTokenizer(closure)->error(XMLTokenizer::nonFatal, message, args);
1140     va_end(args);
1141 }
1142 
1143 // Using a static entity and marking it XML_INTERNAL_PREDEFINED_ENTITY is
1144 // a hack to avoid malloc/free. Using a global variable like this could cause trouble
1145 // if libxml implementation details were to change
1146 static xmlChar sharedXHTMLEntityResult[5] = {0, 0, 0, 0, 0};
1147 
sharedXHTMLEntity()1148 static xmlEntityPtr sharedXHTMLEntity()
1149 {
1150     static xmlEntity entity;
1151     if (!entity.type) {
1152         entity.type = XML_ENTITY_DECL;
1153         entity.orig = sharedXHTMLEntityResult;
1154         entity.content = sharedXHTMLEntityResult;
1155         entity.etype = XML_INTERNAL_PREDEFINED_ENTITY;
1156     }
1157     return &entity;
1158 }
1159 
getXHTMLEntity(const xmlChar * name)1160 static xmlEntityPtr getXHTMLEntity(const xmlChar* name)
1161 {
1162     UChar c = decodeNamedEntity(reinterpret_cast<const char*>(name));
1163     if (!c)
1164         return 0;
1165 
1166     CString value = String(&c, 1).utf8();
1167     ASSERT(value.length() < 5);
1168     xmlEntityPtr entity = sharedXHTMLEntity();
1169     entity->length = value.length();
1170     entity->name = name;
1171     memcpy(sharedXHTMLEntityResult, value.data(), entity->length + 1);
1172 
1173     return entity;
1174 }
1175 
getEntityHandler(void * closure,const xmlChar * name)1176 static xmlEntityPtr getEntityHandler(void* closure, const xmlChar* name)
1177 {
1178     xmlParserCtxtPtr ctxt = static_cast<xmlParserCtxtPtr>(closure);
1179     xmlEntityPtr ent = xmlGetPredefinedEntity(name);
1180     if (ent) {
1181         ent->etype = XML_INTERNAL_PREDEFINED_ENTITY;
1182         return ent;
1183     }
1184 
1185     ent = xmlGetDocEntity(ctxt->myDoc, name);
1186     if (!ent && (getTokenizer(closure)->isXHTMLDocument()
1187 #if ENABLE(XHTMLMP)
1188                  || getTokenizer(closure)->isXHTMLMPDocument()
1189 #endif
1190 #if ENABLE(WML)
1191                  || getTokenizer(closure)->isWMLDocument()
1192 #endif
1193        )) {
1194         ent = getXHTMLEntity(name);
1195         if (ent)
1196             ent->etype = XML_INTERNAL_GENERAL_ENTITY;
1197     }
1198 
1199     return ent;
1200 }
1201 
startDocumentHandler(void * closure)1202 static void startDocumentHandler(void* closure)
1203 {
1204     xmlParserCtxt* ctxt = static_cast<xmlParserCtxt*>(closure);
1205     getTokenizer(closure)->startDocument(ctxt->version, ctxt->encoding, ctxt->standalone);
1206     xmlSAX2StartDocument(closure);
1207 }
1208 
endDocumentHandler(void * closure)1209 static void endDocumentHandler(void* closure)
1210 {
1211     getTokenizer(closure)->endDocument();
1212     xmlSAX2EndDocument(closure);
1213 }
1214 
internalSubsetHandler(void * closure,const xmlChar * name,const xmlChar * externalID,const xmlChar * systemID)1215 static void internalSubsetHandler(void* closure, const xmlChar* name, const xmlChar* externalID, const xmlChar* systemID)
1216 {
1217     getTokenizer(closure)->internalSubset(name, externalID, systemID);
1218     xmlSAX2InternalSubset(closure, name, externalID, systemID);
1219 }
1220 
externalSubsetHandler(void * closure,const xmlChar *,const xmlChar * externalId,const xmlChar *)1221 static void externalSubsetHandler(void* closure, const xmlChar*, const xmlChar* externalId, const xmlChar*)
1222 {
1223     String extId = toString(externalId);
1224     if ((extId == "-//W3C//DTD XHTML 1.0 Transitional//EN")
1225         || (extId == "-//W3C//DTD XHTML 1.1//EN")
1226         || (extId == "-//W3C//DTD XHTML 1.0 Strict//EN")
1227         || (extId == "-//W3C//DTD XHTML 1.0 Frameset//EN")
1228         || (extId == "-//W3C//DTD XHTML Basic 1.0//EN")
1229         || (extId == "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN")
1230         || (extId == "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN")
1231 #if !ENABLE(XHTMLMP)
1232         || (extId == "-//WAPFORUM//DTD XHTML Mobile 1.0//EN")
1233 #endif
1234        )
1235         getTokenizer(closure)->setIsXHTMLDocument(true); // controls if we replace entities or not.
1236 }
1237 
// Intentionally empty. The handler only needs to be installed in order to
// work around a crasher; see:
//   http://bugzilla.gnome.org/show_bug.cgi?id=172255
//   http://bugs.webkit.org/show_bug.cgi?id=5792
static void ignorableWhitespaceHandler(void*, const xmlChar*, int)
{
}
1244 
initializeParserContext(const char * chunk)1245 void XMLTokenizer::initializeParserContext(const char* chunk)
1246 {
1247     xmlSAXHandler sax;
1248     memset(&sax, 0, sizeof(sax));
1249 
1250     sax.error = normalErrorHandler;
1251     sax.fatalError = fatalErrorHandler;
1252     sax.characters = charactersHandler;
1253     sax.processingInstruction = processingInstructionHandler;
1254     sax.cdataBlock = cdataBlockHandler;
1255     sax.comment = commentHandler;
1256     sax.warning = warningHandler;
1257     sax.startElementNs = startElementNsHandler;
1258     sax.endElementNs = endElementNsHandler;
1259     sax.getEntity = getEntityHandler;
1260     sax.startDocument = startDocumentHandler;
1261     sax.endDocument = endDocumentHandler;
1262     sax.internalSubset = internalSubsetHandler;
1263     sax.externalSubset = externalSubsetHandler;
1264     sax.ignorableWhitespace = ignorableWhitespaceHandler;
1265     sax.entityDecl = xmlSAX2EntityDecl;
1266     sax.initialized = XML_SAX2_MAGIC;
1267     m_parserStopped = false;
1268     m_sawError = false;
1269     m_sawXSLTransform = false;
1270     m_sawFirstElement = false;
1271 
1272     XMLTokenizerScope scope(m_doc->docLoader());
1273     if (m_parsingFragment)
1274         m_context = createMemoryParser(&sax, this, chunk);
1275     else
1276         m_context = createStringParser(&sax, this);
1277 }
1278 
doEnd()1279 void XMLTokenizer::doEnd()
1280 {
1281 #if ENABLE(XSLT)
1282     if (m_sawXSLTransform) {
1283         m_doc->setTransformSource(xmlDocPtrForString(m_doc->docLoader(), m_originalSourceForTransform, m_doc->url().string()));
1284 
1285         m_doc->setParsing(false); // Make the doc think it's done, so it will apply xsl sheets.
1286         m_doc->updateStyleSelector();
1287         m_doc->setParsing(true);
1288         m_parserStopped = true;
1289     }
1290 #endif
1291 
1292     if (m_context) {
1293         // Tell libxml we're done.
1294         {
1295             XMLTokenizerScope scope(m_doc->docLoader());
1296             xmlParseChunk(m_context, 0, 0, 1);
1297         }
1298 
1299         if (m_context->myDoc)
1300             xmlFreeDoc(m_context->myDoc);
1301         xmlFreeParserCtxt(m_context);
1302         m_context = 0;
1303     }
1304 }
1305 
1306 #if ENABLE(XSLT)
xmlDocPtrForString(DocLoader * docLoader,const String & source,const String & url)1307 void* xmlDocPtrForString(DocLoader* docLoader, const String& source, const String& url)
1308 {
1309     if (source.isEmpty())
1310         return 0;
1311 
1312     // Parse in a single chunk into an xmlDocPtr
1313     // FIXME: Hook up error handlers so that a failure to parse the main document results in
1314     // good error messages.
1315     const UChar BOM = 0xFEFF;
1316     const unsigned char BOMHighByte = *reinterpret_cast<const unsigned char*>(&BOM);
1317 
1318     XMLTokenizerScope scope(docLoader, errorFunc, 0);
1319     xmlDocPtr sourceDoc = xmlReadMemory(reinterpret_cast<const char*>(source.characters()),
1320                                         source.length() * sizeof(UChar),
1321                                         url.latin1().data(),
1322                                         BOMHighByte == 0xFF ? "UTF-16LE" : "UTF-16BE",
1323                                         XSLT_PARSE_OPTIONS);
1324     return sourceDoc;
1325 }
1326 #endif
1327 
lineNumber() const1328 int XMLTokenizer::lineNumber() const
1329 {
1330     return m_context ? m_context->input->line : 1;
1331 }
1332 
columnNumber() const1333 int XMLTokenizer::columnNumber() const
1334 {
1335     return m_context ? m_context->input->col : 1;
1336 }
1337 
stopParsing()1338 void XMLTokenizer::stopParsing()
1339 {
1340     Tokenizer::stopParsing();
1341     xmlStopParser(m_context);
1342 }
1343 
resumeParsing()1344 void XMLTokenizer::resumeParsing()
1345 {
1346     ASSERT(m_parserPaused);
1347 
1348     m_parserPaused = false;
1349 
1350     // First, execute any pending callbacks
1351     while (!m_pendingCallbacks->isEmpty()) {
1352         m_pendingCallbacks->callAndRemoveFirstCallback(this);
1353 
1354         // A callback paused the parser
1355         if (m_parserPaused)
1356             return;
1357     }
1358 
1359     // Then, write any pending data
1360     SegmentedString rest = m_pendingSrc;
1361     m_pendingSrc.clear();
1362     write(rest, false);
1363 
1364     // Finally, if finish() has been called and write() didn't result
1365     // in any further callbacks being queued, call end()
1366     if (m_finishCalled && m_pendingCallbacks->isEmpty())
1367         end();
1368 }
1369 
parseXMLDocumentFragment(const String & chunk,DocumentFragment * fragment,Element * parent)1370 bool parseXMLDocumentFragment(const String& chunk, DocumentFragment* fragment, Element* parent)
1371 {
1372     if (!chunk.length())
1373         return true;
1374 
1375     XMLTokenizer tokenizer(fragment, parent);
1376 
1377     CString chunkAsUtf8 = chunk.utf8();
1378     tokenizer.initializeParserContext(chunkAsUtf8.data());
1379 
1380     xmlParseContent(tokenizer.m_context);
1381 
1382     tokenizer.endDocument();
1383 
1384     // Check if all the chunk has been processed.
1385     long bytesProcessed = xmlByteConsumed(tokenizer.m_context);
1386     if (bytesProcessed == -1 || ((unsigned long)bytesProcessed) != chunkAsUtf8.length())
1387         return false;
1388 
1389     // No error if the chunk is well formed or it is not but we have no error.
1390     return tokenizer.m_context->wellFormed || xmlCtxtGetLastError(tokenizer.m_context) == 0;
1391 }
1392 
1393 // --------------------------------
1394 
1395 struct AttributeParseState {
1396     HashMap<String, String> attributes;
1397     bool gotAttributes;
1398 };
1399 
attributesStartElementNsHandler(void * closure,const xmlChar * xmlLocalName,const xmlChar *,const xmlChar *,int,const xmlChar **,int nb_attributes,int,const xmlChar ** libxmlAttributes)1400 static void attributesStartElementNsHandler(void* closure, const xmlChar* xmlLocalName, const xmlChar* /*xmlPrefix*/,
1401                                             const xmlChar* /*xmlURI*/, int /*nb_namespaces*/, const xmlChar** /*namespaces*/,
1402                                             int nb_attributes, int /*nb_defaulted*/, const xmlChar** libxmlAttributes)
1403 {
1404     if (strcmp(reinterpret_cast<const char*>(xmlLocalName), "attrs") != 0)
1405         return;
1406 
1407     xmlParserCtxtPtr ctxt = static_cast<xmlParserCtxtPtr>(closure);
1408     AttributeParseState* state = static_cast<AttributeParseState*>(ctxt->_private);
1409 
1410     state->gotAttributes = true;
1411 
1412     xmlSAX2Attributes* attributes = reinterpret_cast<xmlSAX2Attributes*>(libxmlAttributes);
1413     for (int i = 0; i < nb_attributes; i++) {
1414         String attrLocalName = toString(attributes[i].localname);
1415         int valueLength = (int) (attributes[i].end - attributes[i].value);
1416         String attrValue = toString(attributes[i].value, valueLength);
1417         String attrPrefix = toString(attributes[i].prefix);
1418         String attrQName = attrPrefix.isEmpty() ? attrLocalName : attrPrefix + ":" + attrLocalName;
1419 
1420         state->attributes.set(attrQName, attrValue);
1421     }
1422 }
1423 
parseAttributes(const String & string,bool & attrsOK)1424 HashMap<String, String> parseAttributes(const String& string, bool& attrsOK)
1425 {
1426     AttributeParseState state;
1427     state.gotAttributes = false;
1428 
1429     xmlSAXHandler sax;
1430     memset(&sax, 0, sizeof(sax));
1431     sax.startElementNs = attributesStartElementNsHandler;
1432     sax.initialized = XML_SAX2_MAGIC;
1433     xmlParserCtxtPtr parser = createStringParser(&sax, &state);
1434     String parseString = "<?xml version=\"1.0\"?><attrs " + string + " />";
1435     xmlParseChunk(parser, reinterpret_cast<const char*>(parseString.characters()), parseString.length() * sizeof(UChar), 1);
1436     if (parser->myDoc)
1437         xmlFreeDoc(parser->myDoc);
1438     xmlFreeParserCtxt(parser);
1439     attrsOK = state.gotAttributes;
1440     return state.attributes;
1441 }
1442 
1443 }
1444