1"""Helper classes for tests."""
2
3import copy
4import functools
5import unittest
6from unittest import TestCase
7from bs4 import BeautifulSoup
8from bs4.element import (
9    CharsetMetaAttributeValue,
10    Comment,
11    ContentMetaAttributeValue,
12    Doctype,
13    SoupStrainer,
14)
15
16from bs4.builder import HTMLParserTreeBuilder
17default_builder = HTMLParserTreeBuilder
18
19
20class SoupTest(unittest.TestCase):
21
22    @property
23    def default_builder(self):
24        return default_builder()
25
26    def soup(self, markup, **kwargs):
27        """Build a Beautiful Soup object from markup."""
28        builder = kwargs.pop('builder', self.default_builder)
29        return BeautifulSoup(markup, builder=builder, **kwargs)
30
31    def document_for(self, markup):
32        """Turn an HTML fragment into a document.
33
34        The details depend on the builder.
35        """
36        return self.default_builder.test_fragment_to_document(markup)
37
38    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
39        builder = self.default_builder
40        obj = BeautifulSoup(to_parse, builder=builder)
41        if compare_parsed_to is None:
42            compare_parsed_to = to_parse
43
44        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
45
46
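# A minimal usage sketch (this example class is illustrative and not part of
# the original module): concrete test cases subclass SoupTest, build trees
# with soup(), and use assertSoupEquals() to check that markup round-trips
# the way the builder is expected to render it.
class ExampleSoupTestUsage(SoupTest):

    def test_round_trip(self):
        # soup() parses the markup with default_builder.
        soup = self.soup("<p>Hello</p>")
        self.assertEqual("Hello", soup.p.string)
        # assertSoupEquals() re-serializes the first argument and compares
        # it to document_for() applied to the second (or to itself).
        self.assertSoupEquals("<b>", "<b></b>")

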
class HTMLTreeBuilderSmokeTest(object):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.
    """

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype."""
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre>   </pre>")
        self.assertSoupEquals("<textarea> woo  </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        expect = u"\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        html = soup.html
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertTrue(isinstance(content, ContentMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())

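
# Illustrative sketch (this class is not part of the original module; the
# builder-specific test modules, such as the one for html.parser, follow
# this pattern): mix SoupTest with HTMLTreeBuilderSmokeTest and point
# default_builder at the parser under test. Any smoke test a particular
# parser is known not to pass can be overridden with a no-op.
class ExampleHTMLParserSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):

    @property
    def default_builder(self):
        return HTMLParserTreeBuilder()

    def test_namespaced_system_doctype(self):
        # html.parser has historically had trouble with namespaced
        # doctypes, so this inherited test is skipped here.
        pass

    def test_namespaced_public_doctype(self):
        # Same as above.
        pass

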
class XMLTreeBuilderSmokeTest(object):

    def test_docstring_generated(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
  <script type="text/javascript">
  </script>
"""
        soup = BeautifulSoup(doc, "xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)

    def test_can_parse_unicode_document(self):
        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)

    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
            unicode(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + b'0' * (2**12)
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        soup = self.soup(markup)
        root = soup.root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.p), markup)

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)

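
# The same mix-in pattern applies to XMLTreeBuilderSmokeTest. The sketch is
# kept in a comment because the only XML-capable builder requires the
# optional lxml package; the import and class below are illustrative, not
# part of the original module:
#
#     from bs4.builder import LXMLTreeBuilderForXML
#
#     class ExampleLXMLXMLSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
#
#         @property
#         def default_builder(self):
#             return LXMLTreeBuilderForXML()

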
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way.
        pass

    def test_html_tags_have_namespace(self):
        markup = "<a>"
        soup = self.soup(markup)
        self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)

    def test_svg_tags_have_namespace(self):
        markup = '<svg><circle/></svg>'
        soup = self.soup(markup)
        namespace = "http://www.w3.org/2000/svg"
        self.assertEqual(namespace, soup.svg.namespace)
        self.assertEqual(namespace, soup.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        markup = '<math><msqrt>5</msqrt></math>'
        soup = self.soup(markup)
        namespace = 'http://www.w3.org/1998/Math/MathML'
        self.assertEqual(namespace, soup.math.namespace)
        self.assertEqual(namespace, soup.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
        soup = self.soup(markup)
        self.assertTrue(isinstance(soup.contents[0], Comment))
        self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", soup.contents[0].next_element.name)


def skipIf(condition, reason):
    """Replace the decorated test with a no-op if `condition` is true.

    The `reason` argument is accepted for symmetry with unittest.skipIf,
    but it is not used.
    """
    def nothing(test, *args, **kwargs):
        return None

    def decorator(test_item):
        if condition:
            return nothing
        else:
            return test_item

    return decorator

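
# A minimal, self-contained illustration of skipIf (the test class below is
# hypothetical and not part of the original module): when the condition is
# true, the decorator replaces the test with a no-op, so the body never runs
# and the test is reported as a pass rather than a skip.
class ExampleSkipIfTest(unittest.TestCase):

    @skipIf(True, "Condition is true, so this body is never executed.")
    def test_is_replaced_with_a_no_op(self):
        raise AssertionError("Never reached; skipIf returned a no-op.")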