"""Text wrapping and filling.
"""

# Copyright (C) 1999-2001 Gregory P. Ward.
# Copyright (C) 2002, 2003 Python Software Foundation.
# Written by Greg Ward <gward@python.net>

# Version-control keyword placeholder for this module.
__revision__ = "$Id$"
9
import re
import string
11
# Bind the text type used for isinstance() checks further down.  When the
# interpreter has no 'unicode' builtin, substitute a placeholder class
# that no string is an instance of, so those checks are simply false.
try:
    _unicode = unicode
except NameError:
    # If Python is built without Unicode support, the unicode type
    # will not exist. Fake one.
    class _unicode(object):
        pass
19
# Do the right thing with boolean values for all known Python versions
# (so this module can be copied to projects that don't depend on Python
# 2.3, e.g. Optik and Docutils) by uncommenting the block of code below.
#try:
#    True, False
#except NameError:
#    (True, False) = (1, 0)
27
# Public names exported by a star-import of this module.
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent']
29
# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
_whitespace = '\t\n\x0b\x0c\r '
39
class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words. If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
    """

    # Translation table mapping every character of _whitespace to a
    # plain space.  string.maketrans() is not available on Python 3,
    # where str.maketrans() builds the equivalent table.
    try:
        whitespace_trans = string.maketrans(_whitespace,
                                            ' ' * len(_whitespace))
    except AttributeError:
        whitespace_trans = str.maketrans(_whitespace, ' ' * len(_whitespace))

    # Same mapping keyed by code point, for unicode.translate().
    unicode_whitespace_trans = {}
    uspace = ord(u' ')
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # This less funky little regex just split on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(\s+)')

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    # (string.lowercase is used when the string module provides it;
    # otherwise string.ascii_lowercase stands in for it.)
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % getattr(string, 'lowercase',
                                           string.ascii_lowercase))


    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True):
        """Store the wrapping options described in the class docstring."""
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens

        # recompile the regexes for Unicode mode -- done in this clumsy way for
        # backwards compatibility because it's rather common to monkey-patch
        # the TextWrapper class' wordsep_re attribute.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)


    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # Pick the translation table matching the string type; text
            # that is neither str nor _unicode is returned untranslated.
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            elif isinstance(text, _unicode):
                text = text.translate(self.unicode_whitespace_trans)
        return text


    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is True, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        if isinstance(text, _unicode):
            if self.break_on_hyphens:
                pat = self.wordsep_re_uni
            else:
                pat = self.wordsep_simple_re_uni
        else:
            if self.break_on_hyphens:
                pat = self.wordsep_re
            else:
                pat = self.wordsep_simple_re
        # Drop the empty strings that re.split() produces.  Always build
        # a real list: _wrap_chunks() reverses, indexes and pops it.
        return [chunk for chunk in pat.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place; nothing is returned.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                chunks[i+1] = "  "
                # Skip over the whitespace chunk that was just widened.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(reversed_chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  The chunk in question is the
        last element of 'reversed_chunks'; both lists are modified in
        place.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            cur_line.append(reversed_chunks[-1][:space_left])
            reversed_chunks[-1] = reversed_chunks[-1][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        Raises ValueError if 'self.width' is not positive.  The
        'chunks' list is reversed in place and emptied as it is consumed.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines


    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
339
340
# -- Convenience interface ---------------------------------------------

def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    Reformat the single paragraph in 'text' so it fits in lines of no
    more than 'width' columns, and return a list of wrapped lines.  By
    default, tabs in 'text' are expanded with string.expandtabs(), and
    all other whitespace characters (including newline) are converted to
    space.  See TextWrapper class for available keyword args to customize
    wrapping behaviour.
    """
    # A fresh TextWrapper is built per call; 'kwargs' is forwarded
    # unchanged to its constructor.
    w = TextWrapper(width=width, **kwargs)
    return w.wrap(text)
355
def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    Reformat the single paragraph in 'text' to fit in lines of no more
    than 'width' columns, and return a new string containing the entire
    wrapped paragraph.  As with wrap(), tabs are expanded and other
    whitespace characters converted to space.  See TextWrapper class for
    available keyword args to customize wrapping behaviour.
    """
    # A fresh TextWrapper is built per call; 'kwargs' is forwarded
    # unchanged to its constructor.
    w = TextWrapper(width=width, **kwargs)
    return w.fill(text)
367
368
# -- Loosely related functionality -------------------------------------

# Matches a line consisting solely of spaces and/or tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Captures the leading spaces/tabs of every line that has other content.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.  (This behaviour is
    new in Python 2.5; older versions of this module incorrectly
    expanded tabs before searching for common leading whitespace.)

    Lines made up only of spaces and tabs are reduced to empty lines
    and do not take part in computing the common margin.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    text = _whitespace_only_re.sub('', text)
    indents = _leading_whitespace_re.findall(text)
    for indent in indents:
        if margin is None:
            margin = indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(indent):
            margin = indent

        # Find the largest common whitespace between current line and previous
        # winner.
        else:
            for i, (x, y) in enumerate(zip(margin, indent)):
                if x != y:
                    margin = margin[:i]
                    break
            else:
                margin = margin[:len(indent)]

    # sanity check (testing/debugging only)
    if 0 and margin:
        for line in text.split("\n"):
            assert not line or line.startswith(margin), \
                   "line = %r, margin = %r" % (line, margin)

    # 'margin' holds only spaces and tabs (see _leading_whitespace_re),
    # so it can be placed in the pattern without escaping.
    if margin:
        text = re.sub(r'(?m)^' + margin, '', text)
    return text
425
# Minimal manual demo, run only when the file is executed as a script.
if __name__ == "__main__":
    #print(dedent("\tfoo\n\tbar"))
    #print(dedent("  \thello there\n  \t  how are you?"))
    # Single-argument call form prints the same text on Python 2 and 3.
    print(dedent("Hello there.\n  This is indented."))
430