• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2  *
3  * Copyright 2014 Rob Landley <rob@landley.net>
4  *
5  * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6  *
7  * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8  * but N and s///
9  * TODO: make y// handle unicode, unicode delimiters
10  * TODO: handle error return from emit(), error_msg/exit consistently
11  *       What's the right thing to do for -i when write fails? Skip to next?
12  * test '//q' with no previous regex, also repeat previous regex?
13 
14 USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
15 
16 config SED
17   bool "sed"
18   default y
19   help
20     usage: sed [-inrzE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
21 
22     Stream editor. Apply one or more editing SCRIPTs to each line of input
23     (from FILE or stdin) producing output (by default to stdout).
24 
25     -e	Add SCRIPT to list
26     -f	Add contents of SCRIPT_FILE to list
27     -i	Edit each file in place (-iEXT keeps backup file with extension EXT)
28     -n	No default output (use the p command to output matched lines)
29     -r	Use extended regular expression syntax
30     -E	POSIX alias for -r
31     -s	Treat input files separately (implied by -i)
32     -z	Use \0 rather than \n as the input line separator
33 
34     A SCRIPT is a series of one or more COMMANDs separated by newlines or
35     semicolons. All -e SCRIPTs are concatenated together as if separated
36     by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
37     If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
38 
39     Each COMMAND may be preceded by an address which limits the command to
40     apply only to the specified line(s). Commands without an address apply to
41     every line. Addresses are of the form:
42 
43       [ADDRESS[,ADDRESS]][!]COMMAND
44 
45     The ADDRESS may be a decimal line number (starting at 1), a /regular
46     expression/ within a pair of forward slashes, or the character "$" which
47     matches the last line of input. (In -s or -i mode this matches the last
48     line of each file, otherwise just the last line of the last file.) A single
49     address matches one line, a pair of comma separated addresses match
50     everything from the first address to the second address (inclusive). If
51     both addresses are regular expressions, more than one range of lines in
52     each file can match. The second address can be +N to end N lines later.
53 
54     REGULAR EXPRESSIONS in sed are started and ended by the same character
55     (traditionally / but anything except a backslash or a newline works).
56     Backslashes may be used to escape the delimiter if it occurs in the
57     regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
58     and unicode). An empty regex repeats the previous one. ADDRESS regexes
59     (above) require the first delimiter to be escaped with a backslash when
60     it isn't a forward slash (to distinguish it from the COMMANDs below).
61 
62     Sed mostly operates on individual lines one at a time. It reads each line,
63     processes it, and either writes it to the output or discards it before
64     reading the next line. Sed can remember one additional line in a separate
65     buffer (using the h, H, g, G, and x commands), and can read the next line
66     of input early (using the n and N commands), but other than that command
67     scripts operate on individual lines of text.
68 
69     Each COMMAND starts with a single character. The following commands take
70     no arguments:
71 
72       !  Run this command when the test _didn't_ match.
73 
74       {  Start a new command block, continuing until a corresponding "}".
75          Command blocks may nest. If the block has an address, commands within
76          the block are only run for lines within the block's address range.
77 
78       }  End command block (this command cannot have an address)
79 
80       d  Delete this line and move on to the next one
81          (ignores remaining COMMANDs)
82 
83       D  Delete one line of input and restart command SCRIPT (same as "d"
84          unless you've glued lines together with "N" or similar)
85 
86       g  Get remembered line (overwriting current line)
87 
88       G  Get remembered line (appending to current line)
89 
90       h  Remember this line (overwriting remembered line)
91 
92       H  Remember this line (appending to remembered line, if any)
93 
94       l  Print line, escaping \abfrtv (but not newline), octal escaping other
95          nonprintable characters, wrapping lines to terminal width with a
96          backslash, and appending $ to actual end of line.
97 
98       n  Print default output and read next line, replacing current line
99          (If no next line available, quit processing script)
100 
101       N  Append next line of input to this line, separated by a newline
102          (This advances the line counter for address matching and "=", if no
103          next line available quit processing script without default output)
104 
105       p  Print this line
106 
107       P  Print this line up to first newline (from "N")
108 
109       q  Quit (print default output, no more commands processed or lines read)
110 
111       x  Exchange this line with remembered line (overwrite in both directions)
112 
113       =  Print the current line number (followed by a newline)
114 
115     The following commands (may) take an argument. The "text" arguments (to
116     the "a", "c", and "i" commands) may end with an unescaped "\" to append
117     the next line (for which leading whitespace is not skipped), and also
118     treat ";" as a literal character (use "\;" instead).
119 
120       a [text]   Append text to output before attempting to read next line
121 
122       b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
123 
124       c [text]   Delete line, output text at end of matching address range
125                  (ignores remaining COMMANDs)
126 
127       i [text]   Print text
128 
129       r [file]   Append contents of file to output before attempting to read
130                  next line.
131 
132       s/S/R/F    Search for regex S, replace matched text with R using flags F.
133                  The first character after the "s" (anything but newline or
134                  backslash) is the delimiter, escape with \ to use normally.
135 
136                  The replacement text may contain "&" to substitute the matched
137                  text (escape it with backslash for a literal &), or \1 through
138                  \9 to substitute a parenthetical subexpression in the regex.
139                  You can also use the normal backslash escapes such as \n and
140                  a backslash at the end of the line appends the next line.
141 
142                  The flags are:
143 
144                  [0-9]    A number, substitute only that occurrence of pattern
145                  g        Global, substitute all occurrences of pattern
146                  i        Ignore case when matching
147                  p        Print the line if match was found and replaced
148                  w [file] Write (append) line to file if match replaced
149 
150       t [label]  Test, jump to :label only if an "s" command found a match in
151                  this line since last test (replacing with same text counts)
152 
153       T [label]  Test false, jump only if "s" hasn't found a match.
154 
155       w [file]   Write (append) line to file
156 
157       y/old/new/ Change each character in 'old' to corresponding character
158                  in 'new' (with standard backslash escapes, delimiter can be
159                  any repeated character except \ or \n)
160 
161       : [label]  Labeled target for jump commands
162 
163       #  Comment, ignore rest of this line of SCRIPT
164 
165     Deviations from POSIX: allow extended regular expressions with -r,
166     editing in place with -i, separate with -s, NUL-separated input with -z,
167     printf escapes in text, line continuations, semicolons after all commands,
168     2-address anywhere an address is allowed, "T" command, multiline
169     continuations for [abc], \; to end [abc] argument before end of line.
170 */
171 
172 #define FOR_sed
173 #include "toys.h"
174 
// Per-command global state, accessed throughout this file as TT.<field>.
175 GLOBALS(
176   char *i;                   // -i argument; when non-empty it is appended to
                               // the file name to name the backup copy
177   struct arg_list *f, *e;    // presumably the -f and -e arguments (the code
                               // that consumes them is not in this part of
                               // the file - confirm)
178 
179   // processed pattern list
180   struct double_list *pattern; // walked as a list of struct sedcmd
181 
                               // nextline: input line read but not yet
                               // processed (sed_line() runs one line behind).
                               // remember: buffer used by h, H, g, G and x;
                               // the parser also borrows it for s/// regex text
182   char *nextline, *remember;
                               // restart: command to resume at after n/N,
                               // stored +1 so it is never NULL.
                               // lastregex: regex used by an empty regex
183   void *restart, *lastregex;
                               // nextlen/rememberlen: byte lengths of nextline
                               // and remember (the parser also uses nextlen to
                               // count unclosed "{"). count: input line number
184   long nextlen, rememberlen, count;
                               // fdout: descriptor emit() writes to.
                               // noeol: last emit() left off its newline
185   int fdout, noeol;
186   unsigned xx;               // cached wrap width for the "l" command
187   char delim;                // line delimiter handed to do_lines()
188 )
189 
190 // Linked list of parsed sed commands. Offset fields indicate location where
191 // regex or string starts, ala offset+(char *)struct, because we remalloc()
192 // these to expand them for multiline inputs, and pointers would have to be
193 // individually adjusted.
194 
// One parsed script command. An offset field of 0 means "not present".
195 struct sedcmd {
196   struct sedcmd *next, *prev;
197 
198   // Begin and end of each match
199   long lmatch[2]; // line number of match: -1 means "$", -2-N means "+N"
200   int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
201   int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
                     // (the w data is: 4 byte file descriptor, 1 byte noeol
                     // state, then the file name)
                     // not: command was prefixed with "!".
                     // hit: nonzero while this command's address range is
                     // active (holds the line number where it started). The
                     // parser reuses it to mark a command that continues on
                     // the next script line.
202   unsigned not, hit;
203   unsigned sflags; // s///flag bits: i=1, g=2, p=4, occurrence number in bits 3+
204   char c; // action
205 };
206 
207 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)208 static int emit(char *line, long len, int eol)
209 {
210   int l, old = line[len];
211 
212   if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
213   TT.noeol = !eol;
214   if (eol) line[len++] = '\n';
215   if (!len) return 0;
216   l = writeall(TT.fdout, line, len);
217   if (eol) line[len-1] = old;
218   if (l != len) {
219     if (TT.fdout != 1) perror_msg("short write");
220 
221     return 1;
222   }
223 
224   return 0;
225 }
226 
// Grow the allocation at *old (holding oldlen bytes) and append a copy of
// "new" to it. A negative newlen means: insert a newline first, then append
// -newlen bytes. The result is NUL terminated and *old is updated to the
// (possibly moved) allocation. Returns a pointer just past the terminator.
static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  char *buf, *tail;
  int sep = 0;

  if (newlen < 0) {
    sep = 1;
    newlen = -newlen;
  }

  buf = xrealloc(*old, oldlen+newlen+sep+1);
  *old = buf;

  tail = buf+oldlen;
  if (sep) *tail++ = '\n';
  memcpy(tail, new, newlen);
  tail += newlen;
  *tail = 0;

  return tail+1;
}
242 
243 // An empty regex repeats the previous one
get_regex(void * command,int offset)244 static void *get_regex(void *command, int offset)
245 {
246   if (!offset) {
247     if (!TT.lastregex) error_exit("no previous regex");
248     return TT.lastregex;
249   }
250 
251   return TT.lastregex = offset+(char *)command;
252 }
253 
254 // Apply pattern to line from input file
sed_line(char ** pline,long plen)255 static void sed_line(char **pline, long plen)
256 {
257   struct append {
258     struct append *next, *prev;
259     int file;
260     char *str;
261   } *append = 0;
262   char *line = TT.nextline;
263   long len = TT.nextlen;
264   struct sedcmd *command;
265   int eol = 0, tea = 0;
266 
267   // Ignore EOF for all files before last unless -i
268   if (!pline && !FLAG(i)) return;
269 
270   // Grab next line for deferred processing (EOF detection: we get a NULL
271   // pline at EOF to flush last line). Note that only end of _last_ input
272   // file matches $ (unless we're doing -i).
273   TT.nextline = 0;
274   TT.nextlen = 0;
275   if (pline) {
276     TT.nextline = *pline;
277     TT.nextlen = plen;
278     *pline = 0;
279   }
280 
281   if (!line || !len) return;
282   if (line[len-1] == '\n') line[--len] = eol++;
283   TT.count++;
284 
285   // The restart-1 is because we added one to make sure it wasn't NULL,
286   // otherwise N as last command would restart script
287   command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
288   TT.restart = 0;
289 
290   while (command) {
291     char *str, c = command->c;
292 
293     // Have we got a line or regex matching range for this rule?
294     if (*command->lmatch || *command->rmatch) {
295       int miss = 0;
296       long lm;
297 
298       // In a match that might end?
299       if (command->hit) {
300         if (!(lm = command->lmatch[1])) {
301           if (!command->rmatch[1]) command->hit = 0;
302           else {
303             void *rm = get_regex(command, command->rmatch[1]);
304 
305             // regex match end includes matching line, so defer deactivation
306             if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
307           }
308         } else if (lm > 0 && lm < TT.count) command->hit = 0;
309         else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
310 
311       // Start a new match?
312       } else {
313         if (!(lm = *command->lmatch)) {
314           void *rm = get_regex(command, *command->rmatch);
315 
316           if (line && !regexec0(rm, line, len, 0, 0, 0))
317             command->hit = TT.count;
318         } else if (lm == TT.count || (lm == -1 && !pline))
319           command->hit = TT.count;
320 
321         if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
322       }
323 
324       // Didn't match?
325       lm = !(command->not^!!command->hit);
326 
327       // Deferred disable from regex end match
328       if (miss || command->lmatch[1] == TT.count) command->hit = 0;
329 
330       if (lm) {
331         // Handle skipping curly bracket command group
332         if (c == '{') {
333           int curly = 1;
334 
335           while (curly) {
336             command = command->next;
337             if (command->c == '{') curly++;
338             if (command->c == '}') curly--;
339           }
340         }
341         command = command->next;
342         continue;
343       }
344     }
345 
346     // A deleted line can still update line match state for later commands
347     if (!line) {
348       command = command->next;
349       continue;
350     }
351 
352     // Process command
353 
354     if (c=='a' || c=='r') {
355       struct append *a = xzalloc(sizeof(struct append));
356       if (command->arg1) a->str = command->arg1+(char *)command;
357       a->file = c=='r';
358       dlist_add_nomalloc((void *)&append, (void *)a);
359     } else if (c=='b' || c=='t' || c=='T') {
360       int t = tea;
361 
362       if (c != 'b') tea = 0;
363       if (c=='b' || t^(c=='T')) {
364         if (!command->arg1) break;
365         str = command->arg1+(char *)command;
366         for (command = (void *)TT.pattern; command; command = command->next)
367           if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
368             break;
369         if (!command) error_exit("no :%s", str);
370       }
371     } else if (c=='c') {
372       str = command->arg1+(char *)command;
373       if (!command->hit) emit(str, strlen(str), 1);
374       free(line);
375       line = 0;
376       continue;
377     } else if (c=='d') {
378       free(line);
379       line = 0;
380       continue;
381     } else if (c=='D') {
382       // Delete up to \n or end of buffer
383       str = line;
384       while ((str-line)<len) if (*(str++) == '\n') break;
385       len -= str - line;
386       memmove(line, str, len);
387 
388       // if "delete" blanks line, disable further processing
389       // otherwise trim and restart script
390       if (!len) {
391         free(line);
392         line = 0;
393       } else {
394         line[len] = 0;
395         command = (void *)TT.pattern;
396       }
397       continue;
398     } else if (c=='g') {
399       free(line);
400       line = xstrdup(TT.remember);
401       len = TT.rememberlen;
402     } else if (c=='G') {
403       line = xrealloc(line, len+TT.rememberlen+2);
404       line[len++] = '\n';
405       memcpy(line+len, TT.remember, TT.rememberlen);
406       line[len += TT.rememberlen] = 0;
407     } else if (c=='h') {
408       free(TT.remember);
409       TT.remember = xstrdup(line);
410       TT.rememberlen = len;
411     } else if (c=='H') {
412       TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
413       TT.remember[TT.rememberlen++] = '\n';
414       memcpy(TT.remember+TT.rememberlen, line, len);
415       TT.remember[TT.rememberlen += len] = 0;
416     } else if (c=='i') {
417       str = command->arg1+(char *)command;
418       emit(str, strlen(str), 1);
419     } else if (c=='l') {
420       int i, x, off;
421 
422       if (!TT.xx) {
423         terminal_size(&TT.xx, 0);
424         if (!TT.xx) TT.xx = 80;
425         if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
426         if (TT.xx > 4) TT.xx -= 4;
427       }
428 
429       for (i = off = 0; i<len; i++) {
430         if (off >= TT.xx) {
431           toybuf[off++] = '\\';
432           emit(toybuf, off, 1);
433           off = 0;
434         }
435         x = stridx("\\\a\b\f\r\t\v", line[i]);
436         if (x != -1) {
437           toybuf[off++] = '\\';
438           toybuf[off++] = "\\abfrtv"[x];
439         } else if (line[i] >= ' ') toybuf[off++] = line[i];
440         else off += sprintf(toybuf+off, "\\%03o", line[i]);
441       }
442       toybuf[off++] = '$';
443       emit(toybuf, off, 1);
444     } else if (c=='n') {
445       TT.restart = command->next+1;
446 
447       break;
448     } else if (c=='N') {
449       // Can't just grab next line because we could have multiple N and
450       // we need to actually read ahead to get N;$p EOF detection right.
451       if (pline) {
452         TT.restart = command->next+1;
453         extend_string(&line, TT.nextline, len, -TT.nextlen);
454         free(TT.nextline);
455         TT.nextline = line;
456         TT.nextlen += len + 1;
457         line = 0;
458       }
459 
460       // Pending append goes out right after N
461       goto done;
462     } else if (c=='p' || c=='P') {
463       char *l = (c=='P') ? strchr(line, '\n') : 0;
464 
465       if (emit(line, l ? l-line : len, eol)) break;
466     } else if (c=='q' || c=='Q') {
467       if (pline) *pline = (void *)1;
468       free(TT.nextline);
469       if (!toys.exitval && command->arg1)
470         toys.exitval = atoi(command->arg1+(char *)command);
471       TT.nextline = 0;
472       TT.nextlen = 0;
473       if (c=='Q') line = 0;
474 
475       break;
476     } else if (c=='s') {
477       char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
478       regmatch_t *match = (void *)toybuf;
479       regex_t *reg = get_regex(command, command->arg1);
480       int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
481         mlen, off, newlen;
482 
483       // Loop finding match in remaining line (up to remaining len)
484       while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
485         mflags = REG_NOTBOL;
486 
487         // Zero length matches don't count immediately after a previous match
488         mlen = match[0].rm_eo-match[0].rm_so;
489         if (!mlen && !zmatch) {
490           if (rline-line == len) break;
491           l2[l2used++] = *rline++;
492           zmatch++;
493           continue;
494         } else zmatch = 0;
495 
496         // If we're replacing only a specific match, skip if this isn't it
497         off = command->sflags>>3;
498         if (off && off != ++count) {
499           memcpy(l2+l2used, rline, match[0].rm_eo);
500           l2used += match[0].rm_eo;
501           rline += match[0].rm_eo;
502 
503           continue;
504         }
505         // The fact getline() can allocate unbounded amounts of memory is
506         // a bigger issue, but while we're here check for integer overflow
507         if (match[0].rm_eo > INT_MAX) perror_exit(0);
508 
509         // newlen = strlen(new) but with \1 and & and printf escapes
510         for (off = newlen = 0; new[off]; off++) {
511           int cc = -1;
512 
513           if (new[off] == '&') cc = 0;
514           else if (new[off] == '\\') cc = new[++off] - '0';
515           if (cc < 0 || cc > 9) {
516             newlen++;
517             continue;
518           }
519           newlen += match[cc].rm_eo-match[cc].rm_so;
520         }
521 
522         // Copy changed data to new string
523 
524         // Adjust allocation size of new string, copy data we know we'll keep
525         l2l += newlen-mlen;
526         if ((l2l|0xfff) > l2old) l2 = xrealloc(l2, l2old = (l2l|0xfff)+1);
527         if (match[0].rm_so) {
528           memcpy(l2+l2used, rline, match[0].rm_so);
529           l2used += match[0].rm_so;
530         }
531 
532         // copy in new replacement text
533         for (off = mlen = 0; new[off]; off++) {
534           int cc = 0, ll;
535 
536           if (new[off] == '\\') {
537             cc = new[++off] - '0';
538             if (cc<0 || cc>9) {
539               if (!(l2[l2used+mlen++] = unescape(new[off])))
540                 l2[l2used+mlen-1] = new[off];
541 
542               continue;
543             } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
544           } else if (new[off] != '&') {
545             l2[l2used+mlen++] = new[off];
546 
547             continue;
548           }
549 
550           if (match[cc].rm_so != -1) {
551             ll = match[cc].rm_eo-match[cc].rm_so;
552             memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
553             mlen += ll;
554           }
555         }
556         l2used += newlen;
557         rline += match[0].rm_eo;
558 
559         // Stop after first substitution unless we have flag g
560         if (!(command->sflags & 2)) break;
561       }
562 
563       // If we made any changes, finish off l2 and swap it for line
564       if (l2) {
565         // grab trailing unmatched data and null terminator, swap with original
566         mlen = len-(rline-line);
567         memcpy(l2+l2used, rline, mlen+1);
568         len = l2used + mlen;
569         free(line);
570         line = l2;
571       }
572 
573       if (mflags) {
574         // flag p
575         if (command->sflags & 4) emit(line, len, eol);
576 
577         tea = 1;
578         if (command->w) goto writenow;
579       }
580     } else if (c=='w') {
581       int fd, noeol;
582       char *name;
583 
584 writenow:
585       // Swap out emit() context
586       fd = TT.fdout;
587       noeol = TT.noeol;
588 
589       // We save filehandle and newline status before filename
590       name = command->w + (char *)command;
591       memcpy(&TT.fdout, name, 4);
592       name += 4;
593       TT.noeol = *(name++);
594 
595       // write, then save/restore context
596       if (emit(line, len, eol))
597         perror_exit("w '%s'", command->arg1+(char *)command);
598       *(--name) = TT.noeol;
599       TT.noeol = noeol;
600       TT.fdout = fd;
601     } else if (c=='x') {
602       long swap = TT.rememberlen;
603 
604       str = TT.remember;
605       TT.remember = line;
606       line = str;
607       TT.rememberlen = len;
608       len = swap;
609     } else if (c=='y') {
610       char *from, *to = (char *)command;
611       int i, j;
612 
613       from = to+command->arg1;
614       to += command->arg2;
615 
616       for (i = 0; i < len; i++) {
617         j = stridx(from, line[i]);
618         if (j != -1) line[i] = to[j];
619       }
620     } else if (c=='=') {
621       sprintf(toybuf, "%ld", TT.count);
622       if (emit(toybuf, strlen(toybuf), 1)) break;
623     }
624 
625     command = command->next;
626   }
627 
628   if (line && !FLAG(n)) emit(line, len, eol);
629 
630 done:
631   if (dlist_terminate(append)) while (append) {
632     struct append *a = append->next;
633 
634     if (append->file) {
635       int fd = open(append->str, O_RDONLY);
636 
637       // Force newline if noeol pending
638       if (fd != -1) {
639         if (TT.noeol) xwrite(TT.fdout, "\n", 1);
640         TT.noeol = 0;
641         xsendfile(fd, TT.fdout);
642         close(fd);
643       }
644     } else if (append->str) emit(append->str, strlen(append->str), 1);
645     else emit(line, 0, 0);
646     free(append);
647     append = a;
648   }
649   free(line);
650 }
651 
652 // Callback called on each input file
do_sed_file(int fd,char * name)653 static void do_sed_file(int fd, char *name)
654 {
655   char *tmp;
656 
657   if (FLAG(i)) {
658     struct sedcmd *command;
659 
660     if (!fd) return error_msg("-i on stdin");
661     TT.fdout = copy_tempfile(fd, name, &tmp);
662     TT.count = 0;
663     for (command = (void *)TT.pattern; command; command = command->next)
664       command->hit = 0;
665   }
666   do_lines(fd, TT.delim, sed_line);
667   if (FLAG(i)) {
668     if (TT.i && *TT.i) {
669       char *s = xmprintf("%s%s", name, TT.i);
670 
671       xrename(name, s);
672       free(s);
673     }
674     replace_tempfile(-1, TT.fdout, &tmp);
675     TT.fdout = 1;
676     TT.nextline = 0;
677     TT.nextlen = TT.noeol = 0;
678   }
679 }
680 
// Copy chunk of string between two delimiters, converting printf escapes.
//
// pstr:  address of the string to parse; on success *pstr is advanced to the
//        first character after the closing delimiter.
// delim: if NULL, or pointing to a 0 byte, the first character of the string
//        (or the one after a leading backslash) is the delimiter and is saved
//        to *delim when delim isn't NULL. Otherwise *delim is the delimiter
//        and the string starts with the data.
//
// A delimiter inside a regex [bracket expression] doesn't end the string.
// Returns a newly allocated processed copy of the string (caller frees), or 0
// on error (no usable delimiter, or no closing delimiter).
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, *out, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;
  to = out = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside brackets, ']' inside a bracket expression, and one of
  // . = : inside a [.x.] [=x=] [:x:] class within a bracket expression.
  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      // strchr() also matches the terminating NUL, so test for it first
      } else if (mode == ']' && from[1] && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
        // The copy below would step over the end of an unterminated string
        if (!*from) goto fail;
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  // Don't leak the partial copy
  free(out);

  return 0;
}
745 
746 // Translate pattern strings into command structures. Each command structure
747 // is a single allocation (which requires some math and remalloc at times).
parse_pattern(char ** pline,long len)748 static void parse_pattern(char **pline, long len)
749 {
750   struct sedcmd *command = (void *)TT.pattern;
751   char *line, *reg, c, *errstart;
752   int i;
753 
754   line = errstart = pline ? *pline : "";
755   if (len && line[len-1]=='\n') line[--len] = 0;
756 
757   // Append this line to previous multiline command? (hit indicates type.)
758   // During parsing "hit" stores data about line continuations, but in
759   // sed_line() it means the match range attached to this command
760   // is active, so processing the continuation must zero it again.
761   if (command && command->prev->hit) {
762     // Remove half-finished entry from list so remalloc() doesn't confuse it
763     TT.pattern = TT.pattern->prev;
764     command = dlist_pop(&TT.pattern);
765     c = command->c;
766     reg = (char *)command;
767     reg += command->arg1 + strlen(reg + command->arg1);
768 
769     // Resume parsing for 'a' or 's' command. (Only two that can do this.)
770     // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
771     // a unicode character.
772     if (command->hit < 256) goto resume_s;
773     else goto resume_a;
774   }
775 
776   // Loop through commands in this line.
777 
778   command = 0;
779   for (;;) {
780     if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
781 
782     // If there's no more data on this line, return.
783     for (;;) {
784       while (isspace(*line) || *line == ';') line++;
785       if (*line == '#') while (*line && *line != '\n') line++;
786       else break;
787     }
788     if (!*line) return;
789 
790     // Start by writing data into toybuf.
791 
792     errstart = line;
793     memset(toybuf, 0, sizeof(struct sedcmd));
794     command = (void *)toybuf;
795     reg = toybuf + sizeof(struct sedcmd);
796 
797     // Parse address range (if any)
798     for (i = 0; i < 2; i++) {
799       if (*line == ',') line++;
800       else if (i) break;
801 
802       if (i && *line == '+' && isdigit(line[1])) {
803         line++;
804         command->lmatch[i] = -2-strtol(line, &line, 0);
805       } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
806       else if (*line == '$') {
807         command->lmatch[i] = -1;
808         line++;
809       } else if (*line == '/' || *line == '\\') {
810         char *s = line;
811 
812         if (!(s = unescape_delimited_string(&line, 0))) goto error;
813         if (!*s) command->rmatch[i] = 0;
814         else {
815           xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
816           command->rmatch[i] = reg-toybuf;
817           reg += sizeof(regex_t);
818         }
819         free(s);
820       } else break;
821     }
822 
823     while (isspace(*line)) line++;
824     if (!*line) break;
825 
826     if (*line == '!') {
827       command->not = 1;
828       line++;
829     }
830     while (isspace(*line)) line++;
831     if (!*line) break;
832 
833     c = command->c = *(line++);
834     if (strchr("}:", c) && i) break;
835     if (strchr("aiqQr=", c) && i>1) break;
836 
837     // Allocate memory and copy out of toybuf now that we know how big it is
838     command = xmemdup(toybuf, reg-toybuf);
839     reg = (reg-toybuf) + (char *)command;
840 
841     // Parse arguments by command type
842     if (c == '{') TT.nextlen++;
843     else if (c == '}') {
844       if (!TT.nextlen--) break;
845     } else if (c == 's') {
846       char *end, delim = 0;
847 
848       // s/pattern/replacement/flags
849 
850       // line continuations use arg1 (back at the start of the function),
851       // so let's fill out arg2 first (since the regex part can't be multiple
852       // lines) and swap them back later.
853 
854       // get pattern (just record, we parse it later)
855       command->arg2 = reg - (char *)command;
856       if (!(TT.remember = unescape_delimited_string(&line, &delim)))
857         goto error;
858 
859       reg += sizeof(regex_t);
860       command->arg1 = reg-(char *)command;
861       command->hit = delim;
862 resume_s:
863       // get replacement - don't replace escapes yet because \1 and \& need
864       // processing later, after we replace \\ with \ we can't tell \\1 from \1
865       end = line;
866       while (*end != command->hit) {
867         if (!*end) goto error;
868         if (*end++ == '\\') {
869           if (!*end || *end == '\n') {
870             end[-1] = '\n';
871             break;
872           }
873           end++;
874         }
875       }
876 
877       reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
878       line = end;
879       // line continuation? (note: '\n' can't be a valid delim).
880       if (*line == command->hit) command->hit = 0;
881       else {
882         if (!*line) continue;
883         reg--;
884         line++;
885         goto resume_s;
886       }
887 
888       // swap arg1/arg2 so they're back in order arguments occur.
889       i = command->arg1;
890       command->arg1 = command->arg2;
891       command->arg2 = i;
892 
893       // get flags
894       for (line++; *line; line++) {
895         long l;
896 
897         if (isspace(*line) && *line != '\n') continue;
898 
899         if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
900         else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
901           command->sflags |= l << 3;
902           line--;
903         } else break;
904       }
905 
906       // We deferred actually parsing the regex until we had the s///i flag
907       // allocating the space was done by extend_string() above
908       if (!*TT.remember) command->arg1 = 0;
909       else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
910         (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE));
911       free(TT.remember);
912       TT.remember = 0;
913       if (*line == 'w') {
914         line++;
915         goto writenow;
916       }
917     } else if (c == 'w') {
918       int fd, delim;
919       char *cc;
920 
921       // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
922       // eol status, and to retain the filename for error messages, we'd need
923       // to go up to arg5 just for this. Compromise: dynamically allocate the
924       // filehandle and eol status.
925 
926 writenow:
927       while (isspace(*line)) line++;
928       if (!*line) goto error;
929       for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
930       delim = *cc;
931       *cc = 0;
932       fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
933       *cc = delim;
934 
935       command->w = reg - (char *)command;
936       command = xrealloc(command, command->w+(cc-line)+6);
937       reg = command->w + (char *)command;
938 
939       memcpy(reg, &fd, 4);
940       reg += 4;
941       *(reg++) = 0;
942       memcpy(reg, line, delim);
943       reg += delim;
944       *(reg++) = 0;
945 
946       line = cc;
947       if (delim) line += 2;
948     } else if (c == 'y') {
949       char *s, delim = 0;
950       int len;
951 
952       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
953       command->arg1 = reg-(char *)command;
954       len = strlen(s);
955       reg = extend_string((void *)&command, s, reg-(char *)command, len);
956       free(s);
957       command->arg2 = reg-(char *)command;
958       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
959       if (len != strlen(s)) goto error;
960       reg = extend_string((void *)&command, s, reg-(char*)command, len);
961       free(s);
962     } else if (strchr("abcirtTqQw:", c)) {
963       int end;
964 
965       // trim leading spaces
966       while (isspace(*line) && *line != '\n') line++;
967 
968       // Resume logic differs from 's' case because we don't add a newline
969       // unless it's after something, so we add it on return instead.
970 resume_a:
971       command->hit = 0;
972 
973       // btTqQ: end with space or semicolon, aicrw continue to newline.
974       if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){
975         // Argument's optional for btTqQ
976         if (strchr("btTqQ", c)) continue;
977         else if (!command->arg1) break;
978       }
979       // Error checking: qQ can only have digits after them
980       if (c=='q' || c=='Q') {
981         for (i = 0; i<end && isdigit(line[i]); i++);
982         if (i != end) {
983           line += i;
984           break;
985         }
986       }
987 
988       // Extend allocation to include new string. We use offsets instead of
989       // pointers so realloc() moving stuff doesn't break things. Ok to write
990       // \n over NUL terminator because call to extend_string() adds it back.
991       if (!command->arg1) command->arg1 = reg - (char*)command;
992       else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
993       else if (!pline) {
994         command->arg1 = 0;
995         continue;
996       }
997       reg = extend_string((void *)&command, line, reg - (char *)command, end);
998 
999       // Recopy data to remove escape sequences and handle line continuation.
1000       if (strchr("aci", c)) {
1001         reg -= end+1;
1002         for (i = end; i; i--) {
1003           if ((*reg++ = *line++)=='\\') {
1004 
1005             // escape at end of line: resume if -e escaped literal newline,
1006             // else request callback and resume with next line
1007             if (!--i) {
1008               *--reg = 0;
1009               if (*line) {
1010                 line++;
1011                 goto resume_a;
1012               }
1013               command->hit = 256;
1014               break;
1015             }
1016             if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
1017             line++;
1018           }
1019         }
1020         *reg = 0;
1021       } else line += end;
1022 
1023     // Commands that take no arguments
1024     } else if (!strchr("{dDgGhHlnNpPx=", c)) break;
1025   }
1026 
1027 error:
1028   error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
1029 }
1030 
sed_main(void)1031 void sed_main(void)
1032 {
1033   struct arg_list *al;
1034   char **args = toys.optargs;
1035 
1036   if (!FLAG(z)) TT.delim = '\n';
1037 
1038   // Lie to autoconf when it asks stupid questions, so configure regexes
1039   // that look for "GNU sed version %f" greater than some old buggy number
1040   // don't fail us for not matching their narrow expectations.
1041   if (FLAG(version)) {
1042     xprintf("This is not GNU sed version 9.0\n");
1043     return;
1044   }
1045 
1046   // Handling our own --version means we handle our own --help too.
1047   if (FLAG(help)) help_exit(0);
1048 
1049   // Parse pattern into commands.
1050 
1051   // If no -e or -f, first argument is the pattern.
1052   if (!TT.e && !TT.f) {
1053     if (!*toys.optargs) error_exit("no pattern");
1054     (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1055   }
1056 
1057   // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1058   // so handle all -e, then all -f. (At least the behavior's consistent.)
1059 
1060   for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1061   parse_pattern(0, 0);
1062   for (al = TT.f; al; al = al->next)
1063     do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1064   dlist_terminate(TT.pattern);
1065   if (TT.nextlen) error_exit("no }");
1066 
1067   TT.fdout = 1;
1068   TT.remember = xstrdup("");
1069 
1070   // Inflict pattern upon input files. Long version because !O_CLOEXEC
1071   loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1072 
1073   // Provide EOF flush at end of cumulative input for non-i mode.
1074   if (!FLAG(i)) {
1075     toys.optflags |= FLAG_i;
1076     sed_line(0, 0);
1077   }
1078 
1079   // todo: need to close fd when done for TOYBOX_FREE?
1080 }
1081