• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# -*- coding: utf-8 -*-
2"""Beautiful Soup bonus library: Unicode, Dammit
3
This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
8"""
9
10import codecs
11from htmlentitydefs import codepoint2name
12import re
13import logging
14import string
15
# Choose a character-encoding autodetection backend, preferring the
# fastest implementation that is installed. Whichever branch wins
# defines chardet_dammit(s), which returns the detected encoding
# name (or None).
chardet_type = None
try:
    # First choice: the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Second choice: the pure Python implementation.
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
    except ImportError:
        # No detection library is available at all.
        def chardet_dammit(s):
            """Stand-in detector used when no backend is installed."""
            return None
    else:
        def chardet_dammit(s):
            """Return the encoding chardet reports for the bytestring s."""
            detection = chardet.detect(s)
            return detection['encoding']
else:
    def chardet_dammit(s):
        """Return the encoding cchardet reports for the bytestring s."""
        detection = cchardet.detect(s)
        return detection['encoding']
38
# Optional module, available from http://cjkpython.i18n.org/.
# Nothing in the code visible here refers to iconv_codec by name, so
# the import is presumably made for its side effects (NOTE(review):
# likely registers extra codecs -- confirm). A missing installation
# is deliberately ignored.
try:
    import iconv_codec
except ImportError:
    pass
44
# Both patterns are applied to bytestrings, so they are written as
# raw bytes literals. Raw literals keep the regex escapes (\?, \s)
# intact without relying on Python passing unknown string escapes
# through unchanged.

# Matches an XML declaration at the very start of a document and
# captures the value of its encoding="..." pseudo-attribute.
xml_encoding_re = re.compile(
    br'''^<\?.*encoding=['"](.*?)['"].*\?>''', re.I)

# Matches a <meta> tag that carries a charset and captures the
# charset value, whether or not it is quoted.
html_meta_re = re.compile(
    br'''<\s*meta[^>]+charset\s*=\s*["']?([^>]*?)[ /;'">]''', re.I)
49
class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity lookup tables from codepoint2name.

        Called exactly once, while the class body is being executed.

        :return: A 3-tuple: a dict mapping each character to its
          entity name, a dict mapping each entity name back to its
          character, and a compiled regular expression matching any
          single character that has an entry in the first dict.
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # An angle bracket, or an ampersand that does not begin something
    # shaped like a numeric (&#38;, &#x26;) or named (&amp;) entity.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    # Any angle bracket or ampersand, with no exceptions.
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate named HTML entity for the matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in quote characters.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with every character that has a named HTML
          entity (other than the double quote) replaced by that entity.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
193
194
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False):
        """Record the caller's hints and strip any byte-order mark.

        :param markup: The document, as a bytestring.
        :param override_encodings: Encodings to suggest before any
         other; None is treated as an empty list.
        :param is_html: If True, a <meta> tag is also consulted when
         looking for a declared encoding.
        """
        self.override_encodings = override_encodings or []
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Decide whether an encoding should be suggested.

        :param encoding: An encoding name, or None.
        :param tried: The set of lowercased names already suggested;
         a usable name is added to it.
        :return: True if the name is not None and has not been
         suggested before (compared case-insensitively).
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: The document, as a bytestring.
        :return: A 2-tuple (data without the byte-order mark, encoding
         name). The encoding is None when no mark was recognized.
        """
        encoding = None
        # A UTF-16 mark followed by two zero bytes is really (for the
        # little-endian case) the four-byte UTF-32 mark, so the UTF-16
        # branches are skipped then. The comparison must be against a
        # bytes literal: a text literal never equals a bytes slice
        # wherever the two types are distinct.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True, fall back to looking for a <meta>
         tag when no XML declaration is found.
        :param search_entire_document: If False, only the first 1024
         bytes are searched for an XML declaration, and only the
         first 2048 bytes (or the first 5% of the document, if that
         is more) for a <meta> tag.
        :return: The declared encoding in lowercase, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            # A declaration containing non-ASCII bytes cannot name a
            # real codec; 'replace' turns it into a harmless bogus
            # name instead of raising UnicodeDecodeError here.
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
313
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    # Smart-quote substitution is only attempted when the proposed
    # codec is one of these.
    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False):
        """Detect the encoding of markup and convert it to Unicode.

        On return, unicode_markup holds the converted document (or
        None if every conversion failed) and original_encoding holds
        the codec that worked (or None).

        :param markup: The document, as a bytestring or a Unicode
         string. Unicode strings and the empty string are passed
         through untouched.
        :param override_encodings: Encodings to try before any other.
        :param smart_quotes_to: What to turn Microsoft smart-quote
         bytes into: 'ascii', 'xml', any other non-None value for
         HTML entities, or None to leave them alone.
        :param is_html: If True, the document is treated as HTML when
         looking for a declared encoding.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(markup, override_encodings, is_html)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, unicode) or markup == '':
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        if not u:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A regular expression match whose first group is
         the single byte to replace.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (HTML entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = b'&#x' + sub[1].encode() + b';'
                else:
                    sub = b'&' + sub[0].encode() + b';'
            else:
                # A plain replacement character with no entity.
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using one proposed encoding.

        On success, self.markup is replaced by the decoded text and
        self.original_encoding is set to the codec name used.

        :param proposed: The name of the encoding to try.
        :param errors: The error-handling scheme passed to the decoder.
        :return: The decoded text, or None if the codec name is empty,
         the (codec, errors) pair was already tried, or decoding
         failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure at all (unknown codec,
            # undecodable bytes, wrong input type) simply means this
            # proposal did not work and the caller should try another.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return unicode(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding the detector found declared in the document,
        or None if the document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Turn a charset name into a lowercase codec name.

        Tries, in order: the CHARSET_ALIASES translation, the name
        with hyphens removed, and the name with hyphens replaced by
        underscores, keeping the first that codecs.lookup() accepts.
        If none is accepted, the name itself is used.

        :param charset: A charset name, or None.
        :return: The lowercased name, or None for an empty charset.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return charset if codecs.lookup() recognizes it, else None.

        An empty or None charset is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hexadecimal code point); plain
    # string values are literal replacements for bytes with no entity.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. The
                # code point must not be empty, or XML output would
                # contain the malformed reference "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # NOTE(review): the only tuple in this table. _convert_from
        # only substitutes bytes \x80-\x9f, so it is not looked up
        # on that path; left untouched in case other code reads it.
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents. Keys are byte values;
    # values are the UTF-8 encoding of the corresponding character.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        0xff : b'\xc3\xbf',     # ÿ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        Any byte in the range C2-F4 is assumed to start a UTF-8
        multibyte character and is skipped along with the bytes that
        would follow it; the following bytes are not validated.

        :param in_bytes: The document, as a bytestring.
        :param main_encoding: Must be 'utf8' or 'utf-8' (any case).
        :param embedded_encoding: Must be 'windows-1252' or
         'windows_1252' (any case).
        :raise NotImplementedError: If either encoding is not one of
         the supported names.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if start <= byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
829
830