• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Tests to ensure that the lxml tree builder generates good trees."""
2
3import re
4import warnings
5
# lxml is an optional dependency. Record whether it could be imported and
# which version it reports, so the tests below can be skipped when it is
# missing or too old.
try:
    import lxml.etree
    LXML_PRESENT = True
    LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError:
    # The exception object itself is not needed, so it is not bound to a
    # name; this form is valid on both Python 2 and Python 3.
    LXML_PRESENT = False
    # Placeholder version used when lxml is absent.
    LXML_VERSION = (0,)
13
14if LXML_PRESENT:
15    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
16
17from bs4 import (
18    BeautifulSoup,
19    BeautifulStoneSoup,
20    )
21from bs4.element import Comment, Doctype, SoupStrainer
22from bs4.testing import skipIf
23from bs4.tests import test_htmlparser
24from bs4.testing import (
25    HTMLTreeBuilderSmokeTest,
26    XMLTreeBuilderSmokeTest,
27    SoupTest,
28    skipIf,
29)
30
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Smoke tests for the lxml HTML tree builder.

    See ``HTMLTreeBuilderSmokeTest`` for the inherited tests. The methods
    defined here are additional checks run with ``LXMLTreeBuilder`` as the
    default builder. The whole class is skipped when lxml is not importable.
    """

    @property
    def default_builder(self):
        """Return a new ``LXMLTreeBuilder`` instance."""
        return LXMLTreeBuilder()

    def test_out_of_range_entity(self):
        """Numeric character references that are out of range are dropped."""
        self.assertSoupEquals(
            "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.

    @skipIf(
        not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
        "Skipping doctype test for old version of lxml to avoid segfault.")
    def test_empty_doctype(self):
        """An empty ``<!DOCTYPE>`` yields a first node that strips to ''."""
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_beautifulstonesoup_is_xml_parser(self):
        """The deprecated ``BeautifulStoneSoup`` class parses as XML and warns."""
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        with warnings.catch_warnings(record=True) as w:
            # Always record the warning. Under the default filter a warning
            # already issued once from the same location is suppressed,
            # which would leave ``w`` empty and make ``w[0]`` fail with an
            # IndexError instead of a meaningful assertion.
            warnings.simplefilter("always")
            soup = BeautifulStoneSoup("<b />")
        self.assertEqual(u"<b/>", unicode(soup.b))
        self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))

    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Compare with newlines removed on both sides, and with the XML
        # declaration removed from the expected value.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b''),
            markup.replace(b'\n', b'').replace(
                b'<?xml version="1.0" encoding="utf-8"?>', b''))
81
82
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """Smoke tests for the lxml XML tree builder.

    See ``XMLTreeBuilderSmokeTest`` for the inherited tests. The whole
    class is skipped when lxml is not importable.
    """

    @property
    def default_builder(self):
        """Return a new ``LXMLTreeBuilderForXML`` instance."""
        return LXMLTreeBuilderForXML()
92