1 /* `L' command implementation for GNU sed, based on GNU fmt 1.22.
2 Copyright (C) 1994, 1995, 1996, 2002, 2003 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
17
18 /* GNU fmt was written by Ross Paterson <rap@doc.ic.ac.uk>. */
19
20 #include "sed.h"
21
22 #include <stdio.h>
23 #include <string.h>
24 #include <ctype.h>
25 #include <sys/types.h>
26
27 #if HAVE_LIMITS_H
28 # include <limits.h>
29 #endif
30
31 #ifndef UINT_MAX
32 # define UINT_MAX ((unsigned int) ~(unsigned int) 0)
33 #endif
34
35 #ifndef INT_MAX
36 # define INT_MAX ((int) (UINT_MAX >> 1))
37 #endif
38
39 /* The following parameters represent the program's idea of what is
40 "best". Adjust to taste, subject to the caveats given. */
41
42 /* Prefer lines to be LEEWAY % shorter than the maximum width, giving
43 room for optimization. */
44 #define LEEWAY 7
45
46 /* Costs and bonuses are expressed as the equivalent departure from the
47 optimal line length, multiplied by 10. e.g. assigning something a
48 cost of 50 means that it is as bad as a line 5 characters too short
49 or too long. The definition of SHORT_COST(n) should not be changed.
50 However, EQUIV(n) may need tuning. */
51
52 typedef long COST;
53
54 #define MAXCOST (~(((unsigned long) 1) << (8 * sizeof (COST) -1)))
55
56 #define SQR(n) ((n) * (n))
57 #define EQUIV(n) SQR ((COST) (n))
58
59 /* Cost of a filled line n chars longer or shorter than best_width. */
60 #define SHORT_COST(n) EQUIV ((n) * 10)
61
62 /* Cost of the difference between adjacent filled lines. */
63 #define RAGGED_COST(n) (SHORT_COST (n) / 2)
64
65 /* Basic cost per line. */
66 #define LINE_COST EQUIV (70)
67
68 /* Cost of breaking a line after the first word of a sentence, where
69 the length of the word is N. */
70 #define WIDOW_COST(n) (EQUIV (200) / ((n) + 2))
71
72 /* Cost of breaking a line before the last word of a sentence, where
73 the length of the word is N. */
74 #define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2))
75
76 /* Bonus for breaking a line at the end of a sentence. */
77 #define SENTENCE_BONUS EQUIV (50)
78
79 /* Cost of breaking a line after a period not marking end of a sentence.
80 With the definition of sentence we are using (borrowed from emacs, see
81 get_line()) such a break would then look like a sentence break. Hence
82 we assign a very high cost -- it should be avoided unless things are
83 really bad. */
84 #define NOBREAK_COST EQUIV (600)
85
86 /* Bonus for breaking a line before open parenthesis. */
87 #define PAREN_BONUS EQUIV (40)
88
89 /* Bonus for breaking a line after other punctuation. */
90 #define PUNCT_BONUS EQUIV(40)
91
92 /* Credit for breaking a long paragraph one line later. */
93 #define LINE_CREDIT EQUIV(3)
94
95 /* Size of paragraph buffer in words. Longer paragraphs are handled
96 neatly (cf. flush_paragraph()), so there's little to gain by making
97 these larger. */
98 #define MAXWORDS 1000
99
100 #define GETC() (parabuf == end_of_parabuf ? EOF : *parabuf++)
101
102 /* Extra ctype(3)-style macros. */
103
104 #define isopen(c) (strchr ("([`'\"", (c)) != NULL)
105 #define isclose(c) (strchr (")]'\"", (c)) != NULL)
106 #define isperiod(c) (strchr (".?!", (c)) != NULL)
107
108 /* Size of a tab stop, for expansion on input and re-introduction on
109 output. */
110 #define TABWIDTH 8
111
112 /* Word descriptor structure. */
113
114 typedef struct Word WORD;
115
116 struct Word
117 {
118
119 /* Static attributes determined during input. */
120
121 const char *text; /* the text of the word */
122 short length; /* length of this word */
123 short space; /* the size of the following space */
124 unsigned paren:1; /* starts with open paren */
125 unsigned period:1; /* ends in [.?!])* */
126 unsigned punct:1; /* ends in punctuation */
127 unsigned final:1; /* end of sentence */
128
129 /* The remaining fields are computed during the optimization. */
130
131 short line_length; /* length of the best line starting here */
132 COST best_cost; /* cost of best paragraph starting here */
133 WORD *next_break; /* break which achieves best_cost */
134 };
135
136 /* Forward declarations. */
137
138 static bool get_paragraph P_ ((void));
139 static int get_line P_ ((int c));
140 static int get_space P_ ((int c));
141 static int copy_rest P_ ((int c));
142 static bool same_para P_ ((int c));
143 static void flush_paragraph P_ ((void));
144 static void fmt_paragraph P_ ((void));
145 static void check_punctuation P_ ((WORD *w));
146 static COST base_cost P_ ((WORD *this));
147 static COST line_cost P_ ((WORD *next, int len));
148 static void put_paragraph P_ ((WORD *finish));
149 static void put_line P_ ((WORD *w, int indent));
150 static void put_word P_ ((WORD *w));
151 static void put_space P_ ((int space));
152
153 /* Option values. */
154
155 /* User-supplied maximum line width (default WIDTH). The only output
156 lines
157 longer than this will each comprise a single word. */
158 static int max_width;
159
160 /* Space for the paragraph text. */
161 static const char *parabuf;
162
163 /* End of space for the paragraph text. */
164 static const char *end_of_parabuf;
165
166 /* The file on which we output */
167 static FILE *outfile;
168
169 /* Values derived from the option values. */
170
171 /* The preferred width of text lines, set to LEEWAY % less than max_width. */
172 static int best_width;
173
174 /* Dynamic variables. */
175
176 /* Start column of the character most recently read from the input file. */
177 static int in_column;
178
179 /* Start column of the next character to be written to stdout. */
180 static int out_column;
181
182 /* The words of a paragraph -- longer paragraphs are handled neatly
183 (cf. flush_paragraph()). */
184 static WORD words[MAXWORDS];
185
186 /* A pointer into the above word array, indicating the first position
187 after the last complete word. Sometimes it will point at an incomplete
188 word. */
189 static WORD *word_limit;
190
191 /* Indentation of the first line of the current paragraph. */
192 static int first_indent;
193
194 /* Indentation of other lines of the current paragraph */
195 static int other_indent;
196
197 /* The last character read from the input file. */
198 static int next_char;
199
200 /* If nonzero, the length of the last line output in the current
201 paragraph, used to charge for raggedness at the split point for long
202 paragraphs chosen by fmt_paragraph(). */
203 static int last_line_length;
204
205 /* read file F and send formatted output to stdout. */
206
207 void
fmt(const char * line,const char * line_end,int max_length,FILE * output_file)208 fmt (const char *line, const char *line_end, int max_length, FILE *output_file)
209 {
210 parabuf = line;
211 end_of_parabuf = line_end;
212 outfile = output_file;
213
214 max_width = max_length;
215 best_width = max_width * (201 - 2 * LEEWAY) / 200;
216
217 in_column = 0;
218 other_indent = 0;
219 next_char = GETC();
220 while (get_paragraph ())
221 {
222 fmt_paragraph ();
223 put_paragraph (word_limit);
224 }
225 }
226
227 /* Read a paragraph from input file F. A paragraph consists of a
228 maximal number of non-blank (excluding any prefix) lines
229 with the same indent.
230
231 Return false if end-of-file was encountered before the start of a
232 paragraph, else true. */
233
234 static bool
get_paragraph()235 get_paragraph ()
236 {
237 register int c;
238
239 last_line_length = 0;
240 c = next_char;
241
242 /* Scan (and copy) blank lines, and lines not introduced by the prefix. */
243
244 while (c == '\n' || c == EOF)
245 {
246 c = copy_rest (c);
247 if (c == EOF)
248 {
249 next_char = EOF;
250 return false;
251 }
252 putc ('\n', outfile);
253 c = GETC();
254 }
255
256 /* Got a suitable first line for a paragraph. */
257
258 first_indent = in_column;
259 word_limit = words;
260 c = get_line (c);
261
262 /* Read rest of paragraph. */
263
264 other_indent = in_column;
265 while (same_para (c) && in_column == other_indent)
266 c = get_line (c);
267
268 (word_limit - 1)->period = (word_limit - 1)->final = true;
269 next_char = c;
270 return true;
271 }
272
273 /* Copy to the output a blank line. In the latter, C is \n or EOF.
274 Return the character (\n or EOF) ending the line. */
275
276 static int
copy_rest(register int c)277 copy_rest (register int c)
278 {
279 out_column = 0;
280 while (c != '\n' && c != EOF)
281 {
282 putc (c, outfile);
283 c = GETC();
284 }
285 return c;
286 }
287
288 /* Return true if a line whose first non-blank character after the
289 prefix (if any) is C could belong to the current paragraph,
290 otherwise false. */
291
292 static bool
same_para(register int c)293 same_para (register int c)
294 {
295 return (c != '\n' && c != EOF);
296 }
297
298 /* Read a line from the input data given first non-blank character C
299 after the prefix, and the following indent, and break it into words.
300 A word is a maximal non-empty string of non-white characters. A word
301 ending in [.?!]["')\]]* and followed by end-of-line or at least two
302 spaces ends a sentence, as in emacs.
303
304 Return the first non-blank character of the next line. */
305
306 static int
get_line(register int c)307 get_line (register int c)
308 {
309 int start;
310 register WORD *end_of_word;
311
312 end_of_word = &words[MAXWORDS - 2];
313
314 do
315 { /* for each word in a line */
316
317 /* Scan word. */
318
319 word_limit->text = parabuf - 1;
320 do
321 c = GETC();
322 while (c != EOF && !ISSPACE (c));
323 word_limit->length = parabuf - word_limit->text - (c != EOF);
324 in_column += word_limit->length;
325
326 check_punctuation (word_limit);
327
328 /* Scan inter-word space. */
329
330 start = in_column;
331 c = get_space (c);
332 word_limit->space = in_column - start;
333 word_limit->final = (c == EOF
334 || (word_limit->period
335 && (c == '\n' || word_limit->space > 1)));
336 if (c == '\n' || c == EOF)
337 word_limit->space = word_limit->final ? 2 : 1;
338 if (word_limit == end_of_word)
339 flush_paragraph ();
340 word_limit++;
341 if (c == EOF)
342 {
343 in_column = first_indent;
344 return EOF;
345 }
346 }
347 while (c != '\n');
348
349 in_column = 0;
350 c = GETC();
351 return get_space (c);
352 }
353
354 /* Read blank characters from the input data, starting with C, and keeping
355 in_column up-to-date. Return first non-blank character. */
356
357 static int
get_space(register int c)358 get_space (register int c)
359 {
360 for (;;)
361 {
362 if (c == ' ')
363 in_column++;
364 else if (c == '\t')
365 in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
366 else
367 return c;
368 c = GETC();
369 }
370 }
371
372 /* Set extra fields in word W describing any attached punctuation. */
373
374 static void
check_punctuation(register WORD * w)375 check_punctuation (register WORD *w)
376 {
377 register const char *start, *finish;
378
379 start = w->text;
380 finish = start + (w->length - 1);
381 w->paren = isopen (*start);
382 w->punct = ISPUNCT (*finish);
383 while (isclose (*finish) && finish > start)
384 finish--;
385 w->period = isperiod (*finish);
386 }
387
388 /* Flush part of the paragraph to make room. This function is called on
389 hitting the limit on the number of words or characters. */
390
391 static void
flush_paragraph(void)392 flush_paragraph (void)
393 {
394 WORD *split_point;
395 register WORD *w;
396 COST best_break;
397
398 /* - format what you have so far as a paragraph,
399 - find a low-cost line break near the end,
400 - output to there,
401 - make that the start of the paragraph. */
402
403 fmt_paragraph ();
404
405 /* Choose a good split point. */
406
407 split_point = word_limit;
408 best_break = MAXCOST;
409 for (w = words->next_break; w != word_limit; w = w->next_break)
410 {
411 if (w->best_cost - w->next_break->best_cost < best_break)
412 {
413 split_point = w;
414 best_break = w->best_cost - w->next_break->best_cost;
415 }
416 if (best_break <= MAXCOST - LINE_CREDIT)
417 best_break += LINE_CREDIT;
418 }
419 put_paragraph (split_point);
420
421 /* Copy words from split_point down to word -- we use memmove because
422 the source and target may overlap. */
423
424 memmove ((char *) words, (char *) split_point,
425 (word_limit - split_point + 1) * sizeof (WORD));
426 word_limit -= split_point - words;
427 }
428
429 /* Compute the optimal formatting for the whole paragraph by computing
430 and remembering the optimal formatting for each suffix from the empty
431 one to the whole paragraph. */
432
433 static void
fmt_paragraph(void)434 fmt_paragraph (void)
435 {
436 register WORD *start, *w;
437 register int len;
438 register COST wcost, best;
439 int saved_length;
440
441 word_limit->best_cost = 0;
442 saved_length = word_limit->length;
443 word_limit->length = max_width; /* sentinel */
444
445 for (start = word_limit - 1; start >= words; start--)
446 {
447 best = MAXCOST;
448 len = start == words ? first_indent : other_indent;
449
450 /* At least one word, however long, in the line. */
451
452 w = start;
453 len += w->length;
454 do
455 {
456 w++;
457
458 /* Consider breaking before w. */
459
460 wcost = line_cost (w, len) + w->best_cost;
461 if (start == words && last_line_length > 0)
462 wcost += RAGGED_COST (len - last_line_length);
463 if (wcost < best)
464 {
465 best = wcost;
466 start->next_break = w;
467 start->line_length = len;
468 }
469 len += (w - 1)->space + w->length; /* w > start >= words */
470 }
471 while (len < max_width);
472 start->best_cost = best + base_cost (start);
473 }
474
475 word_limit->length = saved_length;
476 }
477
478 /* Return the constant component of the cost of breaking before the
479 word THIS. */
480
481 static COST
base_cost(register WORD * this)482 base_cost (register WORD *this)
483 {
484 register COST cost;
485
486 cost = LINE_COST;
487
488 if (this > words)
489 {
490 if ((this - 1)->period)
491 {
492 if ((this - 1)->final)
493 cost -= SENTENCE_BONUS;
494 else
495 cost += NOBREAK_COST;
496 }
497 else if ((this - 1)->punct)
498 cost -= PUNCT_BONUS;
499 else if (this > words + 1 && (this - 2)->final)
500 cost += WIDOW_COST ((this - 1)->length);
501 }
502
503 if (this->paren)
504 cost -= PAREN_BONUS;
505 else if (this->final)
506 cost += ORPHAN_COST (this->length);
507
508 return cost;
509 }
510
511 /* Return the component of the cost of breaking before word NEXT that
512 depends on LEN, the length of the line beginning there. */
513
514 static COST
line_cost(register WORD * next,register int len)515 line_cost (register WORD *next, register int len)
516 {
517 register int n;
518 register COST cost;
519
520 if (next == word_limit)
521 return 0;
522 n = best_width - len;
523 cost = SHORT_COST (n);
524 if (next->next_break != word_limit)
525 {
526 n = len - next->line_length;
527 cost += RAGGED_COST (n);
528 }
529 return cost;
530 }
531
532 /* Output to stdout a paragraph from word up to (but not including)
533 FINISH, which must be in the next_break chain from word. */
534
535 static void
put_paragraph(register WORD * finish)536 put_paragraph (register WORD *finish)
537 {
538 register WORD *w;
539
540 put_line (words, first_indent);
541 for (w = words->next_break; w != finish; w = w->next_break)
542 put_line (w, other_indent);
543 }
544
545 /* Output to stdout the line beginning with word W, beginning in column
546 INDENT, including the prefix (if any). */
547
548 static void
put_line(register WORD * w,int indent)549 put_line (register WORD *w, int indent)
550 {
551 register WORD *endline;
552 out_column = 0;
553 put_space (indent);
554
555 endline = w->next_break - 1;
556 for (; w != endline; w++)
557 {
558 put_word (w);
559 put_space (w->space);
560 }
561 put_word (w);
562 last_line_length = out_column;
563 putc ('\n', outfile);
564 }
565
566 /* Output to stdout the word W. */
567
568 static void
put_word(register WORD * w)569 put_word (register WORD *w)
570 {
571 register const char *s;
572 register int n;
573
574 s = w->text;
575 for (n = w->length; n != 0; n--)
576 putc (*s++, outfile);
577 out_column += w->length;
578 }
579
580 /* Output to stdout SPACE spaces, or equivalent tabs. */
581
582 static void
put_space(int space)583 put_space (int space)
584 {
585 out_column += space;
586 while (space--)
587 putc (' ', outfile);
588 }
589