• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Text wrapping and filling.
2"""
3
4# Copyright (C) 1999-2001 Gregory P. Ward.
5# Copyright (C) 2002, 2003 Python Software Foundation.
6# Written by Greg Ward <gward@python.net>
7
8__revision__ = "$Id$"
9
10import string, re
11
12# Do the right thing with boolean values for all known Python versions
13# (so this module can be copied to projects that don't depend on Python
14# 2.3, e.g. Optik and Docutils) by uncommenting the block of code below.
15#try:
16#    True, False
17#except NameError:
18#    (True, False) = (1, 0)
19
# Public names exported by a star-import of this module: the wrapper
# class plus the module-level convenience functions defined below.
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent']
21
# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
_whitespace = '\t\n\x0b\x0c\r '

# Compatibility shims so that the class below behaves the same on
# Python 2 (separate str/unicode types, string.maketrans) and on
# Python 3 (str is the Unicode type, str.maketrans).
try:
    _unicode = unicode
except NameError:
    _unicode = str

# 'or' short-circuits, so str.maketrans is only looked up when
# string.maketrans is missing (i.e. on Python 3).
_maketrans = getattr(string, 'maketrans', None) or str.maketrans


class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words.  If true, wrapping will occur
        preferably on whitespace and right after hyphens that are part
        of compound words.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
    """

    # Translation table used for byte strings on Python 2 (on Python 3
    # this is an ordinal -> ordinal mapping, like the table below).
    whitespace_trans = _maketrans(_whitespace, ' ' * len(_whitespace))

    # Ordinal -> ordinal mapping used for Unicode strings: every
    # recognized whitespace character maps to a plain space.
    uspace = ord(' ')
    unicode_whitespace_trans = dict.fromkeys(map(ord, _whitespace), uspace)

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # This less funky little regex just splits on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(\s+)')

    # XXX this is not locale- or charset-aware: only the US-ASCII
    # lowercase letters are recognized (and it is therefore
    # English-only).  string.ascii_lowercase is used on purpose; unlike
    # string.lowercase it never changes with the locale, for the same
    # reason _whitespace is hardcoded above.
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % string.ascii_lowercase)


    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True):
        """Store the wrapping options; see the class docstring for the
        meaning of each one.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens

        # recompile the regexes for Unicode mode -- done in this clumsy way for
        # backwards compatibility because it's rather common to monkey-patch
        # the TextWrapper class' wordsep_re attribute.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)


    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\tbar\n\nbaz"
        becomes " foo    bar  baz".

        Tab expansion only happens when 'expand_tabs' is true and the
        conversion to spaces only when 'replace_whitespace' is true.
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # Unicode text needs the ordinal mapping; Python 2 byte
            # strings need the 256-character table.  Anything else is
            # passed through untouched.
            if isinstance(text, _unicode):
                text = text.translate(self.unicode_whitespace_trans)
            elif isinstance(text, str):
                text = text.translate(self.whitespace_trans)
        return text


    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is True, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        # Unicode text uses the patterns recompiled with re.U in
        # __init__(); byte strings use the class-level patterns.
        if isinstance(text, _unicode):
            if self.break_on_hyphens:
                pat = self.wordsep_re_uni
            else:
                pat = self.wordsep_simple_re_uni
        else:
            if self.break_on_hyphens:
                pat = self.wordsep_re
            else:
                pat = self.wordsep_simple_re
        # Splitting on a capturing group keeps the separators but also
        # yields empty strings between adjacent matches; drop those.
        # A real list is required because _wrap_chunks() reverses it.
        return [chunk for chunk in pat.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.

        'chunks' is modified in place; nothing is returned.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                chunks[i+1] = "  "
                # Skip over the space chunk that was just widened.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.

        'reversed_chunks' is the stack of remaining chunks (next chunk
        last) and 'cur_line' the chunks collected for the current line;
        both lists are modified in place.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            cur_line.append(reversed_chunks[-1][:space_left])
            reversed_chunks[-1] = reversed_chunks[-1][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        The 'chunks' list is reversed in place and consumed.  Raises
        ValueError if 'self.width' is not positive.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                length = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + length <= width:
                    cur_line.append(chunks.pop())
                    cur_len += length

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines


    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
331
332
333# -- Convenience interface ---------------------------------------------
334
def wrap(text, width=70, **kwargs):
    """Wrap one paragraph of text and return the wrapped lines as a list.

    A TextWrapper is built from 'width' and any extra keyword arguments
    and its wrap() method is applied to 'text', so the lines returned are
    at most 'width' columns wide.  With the default options, tabs in
    'text' are expanded with string.expandtabs() and every other
    whitespace character (newline included) is converted to a space.
    See the TextWrapper class for the keyword arguments that customize
    wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
347
def fill(text, width=70, **kwargs):
    """Fill one paragraph of text and return it as a single new string.

    A TextWrapper is built from 'width' and any extra keyword arguments
    and its fill() method is applied to 'text', giving a string whose
    lines are at most 'width' columns wide.  As with wrap(), tabs are
    expanded and other whitespace characters are converted to spaces by
    default.  See the TextWrapper class for the keyword arguments that
    customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
359
360
361# -- Loosely related functionality -------------------------------------
362
# Lines consisting solely of spaces and tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Leading run of spaces/tabs on every line that has other content.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Strip the leading whitespace shared by every line of `text`.

    Handy for triple-quoted strings that are indented in the source code
    but should line up with the left edge when displayed.

    Tabs and spaces both count as whitespace but are never treated as
    equal: the lines "  hello" and "\thello" have no common leading
    whitespace.  Lines made up only of spaces and tabs are reduced to
    empty lines and do not take part in finding the common margin.
    (This behaviour is new in Python 2.5; older versions of this module
    incorrectly expanded tabs before searching for common leading
    whitespace.)
    """
    # Blank out whitespace-only lines first so they cannot influence
    # the margin.
    text = _whitespace_only_re.sub('', text)

    # Find the longest prefix of spaces and tabs common to all lines.
    margin = None
    for indent in _leading_whitespace_re.findall(text):
        # The first line seen sets the initial candidate.
        if margin is None:
            margin = indent
            continue

        # This line is indented at least as deeply as the candidate:
        # the candidate stands.
        if indent.startswith(margin):
            continue

        # Neither prefix contains the other, so the lines share no
        # leading whitespace at all and there is nothing to strip.
        if not margin.startswith(indent):
            margin = ""
            break

        # This line is shallower than, but consistent with, the
        # candidate: it becomes the new candidate.
        margin = indent

    if not margin:
        return text
    return re.sub(r'(?m)^' + margin, '', text)
413
if __name__ == "__main__":
    # Tiny manual demo of dedent().  The calls use the parenthesized,
    # single-argument form of print, which produces the same output
    # whether print is a statement (Python 2) or a function (Python 3).
    #print(dedent("\tfoo\n\tbar"))
    #print(dedent("  \thello there\n  \t  how are you?"))
    # The first line has no indentation, so there is no common margin
    # and the text is printed unchanged.
    print(dedent("Hello there.\n  This is indented."))
418