• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup uses a pluggable XML or HTML parser to parse a
7(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
9navigate, search, and modify the parse tree.
10
11Beautiful Soup works with Python 2.6 and up. It works better if lxml
12and/or html5lib is installed.
13
14For more than you ever wanted to know about Beautiful Soup, see the
15documentation:
16http://www.crummy.com/software/BeautifulSoup/bs4/doc/
17"""
18
# Package metadata: author contact, release version, copyright and license.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.3.2"
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT"

# Only BeautifulSoup is exported by a star-import of this module.
__all__ = ['BeautifulSoup']
25
26import os
27import re
28import warnings
29
30from .builder import builder_registry, ParserRejectedMarkup
31from .dammit import UnicodeDammit
32from .element import (
33    CData,
34    Comment,
35    DEFAULT_OUTPUT_ENCODING,
36    Declaration,
37    Doctype,
38    NavigableString,
39    PageElement,
40    ProcessingInstruction,
41    ResultSet,
42    SoupStrainer,
43    Tag,
44    )
45
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
# NOTE(review): nothing else visible in this file reads `syntax_error`.
# Presumably the u'' literal itself is what fails to parse on Python 3
# releases that reject that prefix, so the message text appears in the
# resulting traceback -- confirm before touching this line.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
49
class BeautifulSoup(Tag):
    """
    The root of a parse tree, and the interface called by tree builders.

    The constructor selects a tree builder, calls reset() and then
    _feed(), which hands the markup to the builder's feed() method.

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    # Name given to the BeautifulSoup object itself when it acts as a tag.
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Space, line feed, tab, form feed, carriage return. A string made
    # only of these is collapsed by endData().
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, or any object with a read() method
            whose contents will be read and parsed.
        :param features: A feature name, or a list of feature names,
            used to look up a tree builder in builder_registry when
            `builder` is not given. Defaults to
            DEFAULT_BUILDER_FEATURES.
        :param builder: A tree builder instance to use instead of
            looking one up by `features`.
        :param parse_only: Optional filter consulted by endData() and
            handle_starttag() for top-level objects (presumably a
            SoupStrainer -- only its `text`, `search` and `search_tag`
            members are used here).
        :param from_encoding: Passed through to the builder's
            prepare_markup().
        :param kwargs: Accepted only so that legacy argument names can
            be recognized; each one triggers a warning and is ignored
            or mapped to its new name.
        :raises TypeError: If an unrecognized keyword argument remains
            after the legacy names have been handled.
        :raises FeatureNotFound: If no registered tree builder offers
            the requested features.
        """

        if 'convertEntities' in kwargs:
            # Remove the argument like every other legacy option below;
            # otherwise the warning is immediately followed by the
            # "unexpected keyword argument" TypeError.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            """Pop a renamed legacy argument from kwargs, warning if present.

            Returns the value passed under `old_name`, or None when the
            caller did not use it.
            """
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        # The legacy name is only consulted (and removed) when the new
        # name was not supplied.
        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if len(kwargs) > 0:
            # Report one of the leftover names. next(iter(...)) works
            # whether keys() is a list or a view.
            arg = next(iter(kwargs))
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, unicode)
                and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
            if markup[:5] == "http:" or markup[:6] == "https:":
                # TODO: This is ugly but I couldn't get it to work in
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and not b' ' in markup)
                    or (isinstance(markup, unicode) and not u' ' in markup)):
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)

        # The builder proposes one or more ways to interpret the
        # markup. Try each in turn from a clean state, and stop at the
        # first one the parser does not reject. If every attempt is
        # rejected, the loop simply ends with no error raised.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
            self.builder.prepare_markup(markup, from_encoding)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        """Run the builder over self.markup, then close whatever is
        still open."""
        # Start the builder from a clean state for this attempt.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Put this object back into the empty state it has before
        any markup is parsed.

        __init__ calls this once per parse attempt.
        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # handle_starttag() and object_was_parsed() read this attribute
        # before they assign it. Clear it explicitly so a retry never
        # links new objects to an element of an abandoned attempt.
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup.

        The tag is built with this soup's builder; extra keyword
        arguments become its attributes.
        """
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup.

        :param s: The text of the string.
        :param subclass: The class to instantiate with `s`.
        """
        navigable = subclass(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        """Always raises NotImplementedError for the document object."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Always raises NotImplementedError for the document object."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Remove the top tag from the stack of open tags.

        Returns the tag that is current *after* the pop (the new top
        of the stack), not the tag that was removed.
        """
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents and make it the
        current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        # Remember tags inside which endData() must leave whitespace alone.
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Finish the pending data node, if any, and add it to the tree.

        :param containerClass: The class used to wrap the collected text.
        """
        if self.current_data:
            current_data = u''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                if all(ch in self.ASCII_SPACES for ch in current_data):
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        :param o: The object to add.
        :param parent: The tag that will contain it; defaults to the
            current tag.
        :param most_recent_element: The element to link `o` after;
            defaults to the last object added by this soup.
        """
        parent = parent or self.currentTag
        most_recent_element = most_recent_element or self._most_recent_element
        o.setup(parent, most_recent_element)

        if most_recent_element is not None:
            most_recent_element.next_element = o
        self._most_recent_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the value of the last popTag() call made, or None if
        nothing was popped.
        """
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk from the top of the stack down to, but not including,
        # index 0, which holds the BeautifulSoup object itself.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        if self._most_recent_element:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Finish any pending data, then pop the stack through the
        most recent open tag with this name and prefix."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Append `data` to the pending data node."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding.

        :param pretty_print: If true, output starts at indent level 0;
            otherwise no indent level is passed on.
        :param eventual_encoding: Named in the XML declaration (when
            this is an XML document and the value is not None) and
            passed on to Tag.decode().
        :param formatter: Passed on to Tag.decode().
        """

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
379
# Short aliases for BeautifulSoup, to make the import quicker to type:
# 'from bs4 import _soup' or 'from bs4 import _s'.
_s = BeautifulSoup
_soup = BeautifulSoup
383
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser.

    Emits a warning, then behaves like BeautifulSoup constructed with
    features="xml".
    """

    def __init__(self, *args, **kwargs):
        """Force the 'xml' feature, warn about the deprecation, and
        forward every argument to BeautifulSoup.

        Any `features` keyword supplied by the caller is overwritten.
        """
        kwargs.update(features='xml')
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        warnings.warn(deprecation_notice)
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
393
394
class StopParsing(Exception):
    """Exception type for halting a parse.

    NOTE(review): nothing visible in this file raises or catches it;
    presumably tree builders use it -- confirm against callers.
    """
397
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no registered tree
    builder offers the requested features."""
400
401
# By default, act as an HTML pretty-printer: parse standard input and
# write the prettified document to standard output.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    # Parenthesized so the line parses both as a Python 2 print
    # statement and as a Python 3 print() call; the output is the same.
    print(soup.prettify())
407