• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Authors: John Dennis <jdennis@redhat.com>
2#
3# Copyright (C) 2007 Red Hat, Inc.
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation; either version 2 of the License, or
8# (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with this program; if not, write to the Free Software
17# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18#
19
20
# Names exported by "from <this module> import *".  The helper classes
# TextWriter and HTMLParserAnchor are deliberately not listed here.
__all__ = [
    'escape_html',
    'unescape_html',
    'html_to_text',

    'html_document',
]
28
29import htmllib
30import formatter as Formatter
31import string
32from types import *
33import StringIO
34
35#------------------------------------------------------------------------------
36
class TextWriter(Formatter.DumbWriter):
    '''DumbWriter variant that indents flowing text by the current margin level.

    The indentation is indent_level * indent_width spaces.  The level
    starts at 0 and is changed through new_margin().
    '''

    def __init__(self, file=None, maxcol=80, indent_width=4):
        '''Create a writer on file that wraps flowing text at maxcol and
        indents indent_width spaces per margin level.'''
        Formatter.DumbWriter.__init__(self, file, maxcol)
        self.indent_width = indent_width
        self.indent_level = 0
        self._set_indent()

    def _set_indent(self):
        '''Recompute indent_col and the indent string from the current level.'''
        width = self.indent_level * self.indent_width
        self.indent_col = width
        self.indent = ' ' * width

    def new_margin(self, margin, level):
        '''Adopt level as the indentation depth; margin itself is not used.'''
        self.indent_level = level
        self._set_indent()

    def send_label_data(self, data):
        '''Write a label followed by one space, right-aligned so that it
        ends at the indentation column when it is short enough to fit.'''
        label = data + ' '
        # rjust() pads on the left up to indent_col and leaves a label that
        # is already wider than the indentation untouched.
        self.send_literal_data(label.rjust(self.indent_col))

    def send_flowing_data(self, data):
        '''Write whitespace-separated words, wrapping before maxcol and
        starting every output line with the current indentation.'''
        if not data:
            return
        pending_break = self.atbreak or data[0] in string.whitespace
        column = self.col
        limit = self.maxcol
        emit = self.file.write
        if column == 0:
            # At the start of a line: emit the margin before any text.
            emit(self.indent)
            column = self.indent_col
        for token in data.split():
            if pending_break:
                if column + len(token) < limit:
                    emit(' ')
                    column += 1
                else:
                    # The word would reach maxcol: continue on a new,
                    # indented line instead.
                    emit('\n' + self.indent)
                    column = self.indent_col
            emit(token)
            column += len(token)
            # Every word after the first is preceded by a separator.
            pending_break = True
        self.col = column
        self.atbreak = data[-1] in string.whitespace
83
84class HTMLParserAnchor(htmllib.HTMLParser):
85
86    def __init__(self, formatter, verbose=0):
87        htmllib.HTMLParser.__init__(self, formatter, verbose)
88
89    def anchor_bgn(self, href, name, type):
90        self.anchor = href
91
92    def anchor_end(self):
93        if self.anchor:
94            self.handle_data(' (%s) ' % self.anchor)
95            self.anchor = None
96
97#------------------------------------------------------------------------------
98
def escape_html(s):
    '''Return s with the characters & < > ' " replaced by their entity
    references (&amp; &lt; &gt; &apos; &quot;).  None is passed through.
    '''
    if s is None:
        return None
    # '&' must be handled first, otherwise the ampersands introduced by
    # the other replacements would be escaped a second time.
    substitutions = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("'", "&apos;"),
        ('"', "&quot;"),
    )
    for char, entity in substitutions:
        s = s.replace(char, entity)
    return s
107
108
def unescape_html(s):
    '''Return s with the entity references &lt; &gt; &apos; &quot; &amp;
    replaced by the characters they stand for.  None is passed through.
    '''
    # Nothing to do for None or for text that contains no entity at all.
    if s is None or '&' not in s:
        return s
    # '&amp;' must be handled last so that an escaped ampersand followed
    # by e.g. "lt;" is not turned into '<'.
    substitutions = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&apos;", "'"),
        ("&quot;", '"'),
        ("&amp;", "&"),
    )
    for entity, char in substitutions:
        s = s.replace(entity, char)
    return s
119
def html_to_text(html, maxcol=80):
    '''Render an HTML string as plain text.

    The markup is parsed by HTMLParserAnchor and laid out by a TextWriter,
    so flowing text is wrapped at maxcol and the href of every anchor is
    written in parentheses after the anchor text.

    html:   the HTML markup to convert.
    maxcol: column at which flowing text is wrapped (default 80).

    Returns the rendered text, or None if the conversion raised an
    exception; the failure is logged instead of being propagated.
    '''
    # Imported here so that the error path is self-contained: no
    # module-level logger object is defined in this file.
    import logging
    import sys

    try:
        output = StringIO.StringIO()
        formatter = Formatter.AbstractFormatter(TextWriter(output, maxcol))
        parser = HTMLParserAnchor(formatter)
        parser.feed(html)
        parser.close()
        text = output.getvalue()
        output.close()
        return text
    except Exception:
        # Conversion is best effort: report the problem and hand back None.
        # sys.exc_info() is used to fetch the exception because it is
        # spelled the same way on every Python version.
        logging.getLogger(__name__).error('cannot convert html to text: %s',
                                          sys.exc_info()[1])
        return None
133
def html_document(*body_components):
    '''Wrap the body components in an HTML document structure with a valid header.

    Accepts a variable number of arguments, each of which can be:
    * a string
    * a sequence of strings (tuple or list)
    * a callable object taking no parameters and returning a string or
      a sequence of strings (tuple or list)

    The components are concatenated in order, without any separator,
    between the document head and tail.  Returns the complete document
    as a single string.
    '''
    head = '<html>\n  <head>\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n  </head>\n  <body>\n'
    tail = '\n  </body>\n</html>'

    parts = [head]

    for body_component in body_components:
        # A callable stands in for its result, which is then treated
        # exactly like a directly supplied component.
        if callable(body_component):
            body_component = body_component()
        # isinstance() also accepts list/tuple subclasses; anything else
        # is expected to be a string and is appended as is.
        if isinstance(body_component, (tuple, list)):
            parts.extend(body_component)
        else:
            parts.append(body_component)

    parts.append(tail)
    return ''.join(parts)
164