"""Tests to ensure that the lxml tree builder generates good trees."""

import re
import warnings

# lxml is optional.  Record whether it is importable (and which version)
# so the test classes below can be skipped cleanly when it is missing.
try:
    import lxml.etree
    LXML_PRESENT = True
    LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError:
    # The exception object itself is not needed, so it is not bound.
    LXML_PRESENT = False
    LXML_VERSION = (0,)

if LXML_PRESENT:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
    )
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import skipIf
from bs4.tests import test_htmlparser
from bs4.testing import (
    HTMLTreeBuilderSmokeTest,
    XMLTreeBuilderSmokeTest,
    SoupTest,
    skipIf,
)


@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Run the shared HTML smoke tests against ``LXMLTreeBuilder``.

    See ``HTMLTreeBuilderSmokeTest`` for the inherited test cases; the
    methods defined here add lxml-specific checks.
    """

    @property
    def default_builder(self):
        """Return a fresh ``LXMLTreeBuilder`` instance."""
        return LXMLTreeBuilder()

    def test_out_of_range_entity(self):
        """Each input below is expected to come out as ``<p>foobar</p>``."""
        # NOTE(review): the three inputs are byte-identical and contain a
        # literal U+FFFD replacement character rather than a character
        # reference.  Presumably they were three distinct out-of-range
        # numeric references that were damaged in transit -- confirm
        # against the upstream file before relying on this test.
        self.assertSoupEquals(
            "<p>foo�bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo�bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo�bar</p>", "<p>foobar</p>")

    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.
    @skipIf(
        not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
        "Skipping doctype test for old version of lxml to avoid segfault.")
    def test_empty_doctype(self):
        """An empty ``<!DOCTYPE>`` yields a first node that strips to ""."""
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_beautifulstonesoup_is_xml_parser(self):
        """The deprecated ``BeautifulStoneSoup`` class parses as XML and warns."""
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        with warnings.catch_warnings(record=True) as w:
            # Force the warning to be recorded even if an identical one
            # was already emitted earlier in this process; otherwise
            # ``w`` can be empty and ``w[0]`` below raises IndexError.
            warnings.simplefilter("always")
            soup = BeautifulStoneSoup("<b />")
        self.assertEqual(u"<b/>", unicode(soup.b))
        self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))

    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Compare with newlines removed on both sides; the expected value
        # is the input minus its XML declaration.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b''),
            markup.replace(b'\n', b'').replace(
                b'<?xml version="1.0" encoding="utf-8"?>', b''))


@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """Run the shared XML smoke tests against ``LXMLTreeBuilderForXML``.

    See ``XMLTreeBuilderSmokeTest`` for the inherited test cases.
    """

    @property
    def default_builder(self):
        """Return a fresh ``LXMLTreeBuilderForXML`` instance."""
        return LXMLTreeBuilderForXML()