• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Text wrapping and filling.
2"""
3
4# Copyright (C) 1999-2001 Gregory P. Ward.
5# Copyright (C) 2002, 2003 Python Software Foundation.
6# Written by Greg Ward <gward@python.net>
7
8import re
9
# Names exported by a star-import of this module.
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
11
# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that
# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
_whitespace = '\t\n\x0b\x0c\r '

class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 0 .. 'tabsize' spaces, depending on its position
        in its line.  If false, each tab is treated as a single character.
      tabsize (default: 8)
        Expand tabs in input text to 0 .. 'tabsize' spaces, unless
        'expand_tabs' is false.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words. If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.  Any truthy value enables this behaviour.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
      max_lines (default: None)
        Truncate wrapped lines: if not None, the output is cut off once
        'max_lines' lines have been produced.
      placeholder (default: ' [...]')
        Append to the last line of truncated text.
    """

    # Translation table (for str.translate) mapping every recognized
    # whitespace character to a plain space.
    unicode_whitespace_trans = {}
    uspace = ord(' ')
    for x in _whitespace:
        unicode_whitespace_trans[ord(x)] = uspace

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    word_punct = r'[\w!"\'&.,?]'
    letter = r'[^\d\W]'
    whitespace = r'[%s]' % re.escape(_whitespace)
    nowhitespace = '[^' + whitespace[1:]
    wordsep_re = re.compile(r'''
        ( # any whitespace
          %(ws)s+
        | # em-dash between words
          (?<=%(wp)s) -{2,} (?=\w)
        | # word, possibly hyphenated
          %(nws)s+? (?:
            # hyphenated word
              -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
              (?= %(lt)s -? %(lt)s)
            | # end of word
              (?=%(ws)s|\Z)
            | # em-dash
              (?<=%(wp)s) (?=-{2,}\w)
            )
        )''' % {'wp': word_punct, 'lt': letter,
                'ws': whitespace, 'nws': nowhitespace},
        re.VERBOSE)
    del word_punct, letter, nowhitespace

    # This less funky little regex just split on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
    del whitespace

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    sentence_end_re = re.compile(r'[a-z]'             # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z')               # end of chunk

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True,
                 tabsize=8,
                 *,
                 max_lines=None,
                 placeholder=' [...]'):
        """Store the wrapping options as instance attributes.

        No validation happens here; 'width', 'max_lines' and
        'placeholder' are only checked when _wrap_chunks() runs.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens
        self.tabsize = tabsize
        self.max_lines = max_lines
        self.placeholder = placeholder


    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".

        Each step only runs when the matching option ('expand_tabs',
        'replace_whitespace') is enabled.
        """
        if self.expand_tabs:
            text = text.expandtabs(self.tabsize)
        if self.replace_whitespace:
            text = text.translate(self.unicode_whitespace_trans)
        return text


    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is true, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        # Plain truthiness test, matching the check made in
        # _handle_long_word(), so that both methods agree for any value.
        if self.break_on_hyphens:
            chunks = self.wordsep_re.split(text)
        else:
            chunks = self.wordsep_simple_re.split(text)
        # re.split() leaves empty strings between adjacent matches.
        return [c for c in chunks if c]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.

        'chunks' is modified in place; nothing is returned.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                chunks[i+1] = "  "
                # Skip the whitespace chunk that was just widened.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.

        'reversed_chunks' is the stack of pending chunks (next chunk
        last) and 'cur_line' the chunks of the line being built; both
        are modified in place.  'cur_len' is the current length of
        'cur_line' and 'width' the room available on this line.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            end = space_left
            chunk = reversed_chunks[-1]
            if self.break_on_hyphens and len(chunk) > space_left:
                # break after last hyphen, but only if there are
                # non-hyphens before it
                hyphen = chunk.rfind('-', 0, space_left)
                if hyphen > 0 and any(c != '-' for c in chunk[:hyphen]):
                    end = hyphen + 1
            cur_line.append(chunk[:end])
            reversed_chunks[-1] = chunk[end:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        The 'chunks' list is reversed in place and consumed while
        wrapping.  Raises ValueError if 'self.width' is not positive,
        or if 'max_lines' is set and the placeholder cannot fit next to
        the relevant indent within 'self.width'.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)
        if self.max_lines is not None:
            # The placeholder ends up on the last permitted line, which
            # is a "subsequent" line unless only one line is allowed.
            if self.max_lines > 1:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
            if len(indent) + len(self.placeholder.lstrip()) > self.width:
                raise ValueError("placeholder too large for max width")

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)
                cur_len = sum(map(len, cur_line))

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                cur_len -= len(cur_line[-1])
                del cur_line[-1]

            if cur_line:
                # Emit the line as-is when there is no line limit, when
                # the limit has not been reached yet, or when this line
                # holds all the remaining text (ignoring one trailing
                # whitespace chunk that would be dropped anyway).
                if (self.max_lines is None or
                    len(lines) + 1 < self.max_lines or
                    (not chunks or
                     self.drop_whitespace and
                     len(chunks) == 1 and
                     not chunks[0].strip()) and cur_len <= width):
                    # Convert current line back to a string and store it in
                    # list of all lines (return value).
                    lines.append(indent + ''.join(cur_line))
                else:
                    # Truncation: remove chunks from the end of the line
                    # until the placeholder fits after a non-blank chunk.
                    while cur_line:
                        if (cur_line[-1].strip() and
                            cur_len + len(self.placeholder) <= width):
                            cur_line.append(self.placeholder)
                            lines.append(indent + ''.join(cur_line))
                            break
                        cur_len -= len(cur_line[-1])
                        del cur_line[-1]
                    else:
                        # Nothing of this line could be kept: try to put
                        # the placeholder on the previous line, otherwise
                        # give it a line of its own.
                        if lines:
                            prev_line = lines[-1].rstrip()
                            if (len(prev_line) + len(self.placeholder) <=
                                    self.width):
                                lines[-1] = prev_line + self.placeholder
                                break
                        lines.append(indent + self.placeholder.lstrip())
                    break

        return lines

    def _split_chunks(self, text):
        """_split_chunks(text : string) -> [string]

        Normalize whitespace in 'text' with _munge_whitespace() and
        return the chunks produced by _split().
        """
        text = self._munge_whitespace(text)
        return self._split(text)

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        chunks = self._split_chunks(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
372
373
374# -- Convenience interface ---------------------------------------------
375
def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    The paragraph in 'text' is reformatted so that every line is at most
    'width' columns wide, and the resulting lines are returned as a list.
    By default tabs are expanded with string.expandtabs() and every other
    whitespace character (newline included) becomes a space.  Any extra
    keyword arguments are passed to the TextWrapper constructor; see that
    class for the options available to customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
388
def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    The paragraph in 'text' is reformatted so that every line is at most
    'width' columns wide, and the whole wrapped paragraph is returned as
    one new string.  As with wrap(), tabs are expanded and other
    whitespace characters converted to space.  Any extra keyword
    arguments are passed to the TextWrapper constructor; see that class
    for the options available to customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
400
def shorten(text, width, **kwargs):
    """Collapse and truncate the given text to fit in the given width.

    Runs of whitespace in the text are first collapsed to single spaces.
    If the result fits in the *width*, it is returned as is.  Otherwise,
    as many words as possible are joined and then the placeholder is
    appended::

        >>> textwrap.shorten("Hello  world!", width=12)
        'Hello world!'
        >>> textwrap.shorten("Hello  world!", width=11)
        'Hello [...]'
    """
    wrapper = TextWrapper(width=width, max_lines=1, **kwargs)
    words = text.strip().split()
    collapsed = ' '.join(words)
    return wrapper.fill(collapsed)
415
416
417# -- Loosely related functionality -------------------------------------
418
# Matches a line made up solely of spaces and/or tabs.
_whitespace_only_re = re.compile(r'^[ \t]+$', re.MULTILINE)
# Captures the leading run of spaces/tabs of every line that contains a
# character other than space, tab or newline.
_leading_whitespace_re = re.compile(r'(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.

    Entirely blank lines are normalized to a newline character.

    Returns the dedented text; when there is no common margin the only
    change is the normalization of blank lines.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    text = _whitespace_only_re.sub('', text)
    for line_indent in _leading_whitespace_re.findall(text):
        if margin is None:
            margin = line_indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif line_indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(line_indent):
            margin = line_indent

        # Find the largest common whitespace between current line and previous
        # winner.
        else:
            for i, (x, y) in enumerate(zip(margin, line_indent)):
                if x != y:
                    margin = margin[:i]
                    break

    # margin holds only spaces and tabs, so it can be placed in the
    # pattern without escaping.
    if margin:
        text = re.sub(r'(?m)^' + margin, '', text)
    return text
471
472
def indent(text, prefix, predicate=None):
    """Adds 'prefix' to the beginning of selected lines in 'text'.

    A line is selected when 'predicate(line)' is true.  When no
    'predicate' is given, every line that is non-empty and does not
    consist solely of whitespace characters is selected.  Line endings
    are kept, and lines that are not selected are returned unchanged.
    """
    if predicate is None:
        # Default: a line qualifies when something remains after stripping.
        wants_prefix = lambda line: line.strip()
    else:
        wants_prefix = predicate

    pieces = [
        prefix + line if wants_prefix(line) else line
        for line in text.splitlines(True)
    ]
    return ''.join(pieces)
489
490
if __name__ == "__main__":
    # Manual smoke test: the first line of the sample has no indentation,
    # so there is no common margin and the text is printed unchanged.
    sample = "Hello there.\n  This is indented."
    print(dedent(sample))
495