1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6 *
7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8 * but N and s///
9 * TODO: make y// handle unicode, unicode delimiters
10 * TODO: handle error return from emit(), error_msg/exit consistently
11 * What's the right thing to do for -i when write fails? Skip to next?
12 * test '//q' with no previous regex, also repeat previous regex?
13
14 USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
15
16 config SED
17 bool "sed"
18 default y
19 help
20 usage: sed [-inrzE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
21
22 Stream editor. Apply one or more editing SCRIPTs to each line of input
23 (from FILE or stdin) producing output (by default to stdout).
24
25 -e Add SCRIPT to list
26 -f Add contents of SCRIPT_FILE to list
27 -i Edit each file in place (-iEXT keeps backup file with extension EXT)
28 -n No default output (use the p command to output matched lines)
29 -r Use extended regular expression syntax
30 -E POSIX alias for -r
31 -s Treat input files separately (implied by -i)
32 -z Use \0 rather than \n as the input line separator
33
34 A SCRIPT is a series of one or more COMMANDs separated by newlines or
35 semicolons. All -e SCRIPTs are concatenated together as if separated
36 by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
37 If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
38
39 Each COMMAND may be preceded by an address which limits the command to
40 apply only to the specified line(s). Commands without an address apply to
41 every line. Addresses are of the form:
42
43 [ADDRESS[,ADDRESS]][!]COMMAND
44
45 The ADDRESS may be a decimal line number (starting at 1), a /regular
46 expression/ within a pair of forward slashes, or the character "$" which
47 matches the last line of input. (In -s or -i mode this matches the last
48 line of each file, otherwise just the last line of the last file.) A single
49 address matches one line, a pair of comma separated addresses match
50 everything from the first address to the second address (inclusive). If
51 both addresses are regular expressions, more than one range of lines in
52 each file can match. The second address can be +N to end N lines later.
53
54 REGULAR EXPRESSIONS in sed are started and ended by the same character
55 (traditionally / but anything except a backslash or a newline works).
56 Backslashes may be used to escape the delimiter if it occurs in the
57 regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
58 and unicode). An empty regex repeats the previous one. ADDRESS regexes
59 (above) require the first delimiter to be escaped with a backslash when
60 it isn't a forward slash (to distinguish it from the COMMANDs below).
61
62 Sed mostly operates on individual lines one at a time. It reads each line,
63 processes it, and either writes it to the output or discards it before
64 reading the next line. Sed can remember one additional line in a separate
65 buffer (using the h, H, g, G, and x commands), and can read the next line
66 of input early (using the n and N command), but other than that command
67 scripts operate on individual lines of text.
68
69 Each COMMAND starts with a single character. The following commands take
70 no arguments:
71
72 ! Run this command when the test _didn't_ match.
73
74 { Start a new command block, continuing until a corresponding "}".
75 Command blocks may nest. If the block has an address, commands within
76 the block are only run for lines within the block's address range.
77
78 } End command block (this command cannot have an address)
79
80 d Delete this line and move on to the next one
81 (ignores remaining COMMANDs)
82
83 D Delete one line of input and restart command SCRIPT (same as "d"
84 unless you've glued lines together with "N" or similar)
85
86 g Get remembered line (overwriting current line)
87
88 G Get remembered line (appending to current line)
89
90 h Remember this line (overwriting remembered line)
91
92 H Remember this line (appending to remembered line, if any)
93
94 l Print line, escaping \abfrtv (but not newline), octal escaping other
95 nonprintable characters, wrapping lines to terminal width with a
96 backslash, and appending $ to actual end of line.
97
98 n Print default output and read next line, replacing current line
99 (If no next line available, quit processing script)
100
101 N Append next line of input to this line, separated by a newline
102 (This advances the line counter for address matching and "=", if no
103 next line available quit processing script without default output)
104
105 p Print this line
106
107 P Print this line up to first newline (from "N")
108
109 q Quit (print default output, no more commands processed or lines read)
110
111 x Exchange this line with remembered line (overwrite in both directions)
112
113 = Print the current line number (followed by a newline)
114
115 The following commands (may) take an argument. The "text" arguments (to
the "a", "c", and "i" commands) may end with an unescaped "\" to append
117 the next line (for which leading whitespace is not skipped), and also
118 treat ";" as a literal character (use "\;" instead).
119
120 a [text] Append text to output before attempting to read next line
121
122 b [label] Branch, jumps to :label (or with no label, to end of SCRIPT)
123
124 c [text] Delete line, output text at end of matching address range
125 (ignores remaining COMMANDs)
126
127 i [text] Print text
128
129 r [file] Append contents of file to output before attempting to read
130 next line.
131
132 s/S/R/F Search for regex S, replace matched text with R using flags F.
133 The first character after the "s" (anything but newline or
134 backslash) is the delimiter, escape with \ to use normally.
135
136 The replacement text may contain "&" to substitute the matched
137 text (escape it with backslash for a literal &), or \1 through
138 \9 to substitute a parenthetical subexpression in the regex.
139 You can also use the normal backslash escapes such as \n and
140 a backslash at the end of the line appends the next line.
141
142 The flags are:
143
144 [0-9] A number, substitute only that occurrence of pattern
145 g Global, substitute all occurrences of pattern
146 i Ignore case when matching
147 p Print the line if match was found and replaced
148 w [file] Write (append) line to file if match replaced
149
150 t [label] Test, jump to :label only if an "s" command found a match in
151 this line since last test (replacing with same text counts)
152
153 T [label] Test false, jump only if "s" hasn't found a match.
154
155 w [file] Write (append) line to file
156
157 y/old/new/ Change each character in 'old' to corresponding character
158 in 'new' (with standard backslash escapes, delimiter can be
159 any repeated character except \ or \n)
160
161 : [label] Labeled target for jump commands
162
163 # Comment, ignore rest of this line of SCRIPT
164
165 Deviations from POSIX: allow extended regular expressions with -r,
166 editing in place with -i, separate with -s, NUL-separated input with -z,
167 printf escapes in text, line continuations, semicolons after all commands,
168 2-address anywhere an address is allowed, "T" command, multiline
169 continuations for [abc], \; to end [abc] argument before end of line.
170 */
171
172 #define FOR_sed
173 #include "toys.h"
174
// Per-command state for sed, referenced throughout this file as TT.<field>.
GLOBALS(
  char *i;                 // -i argument; do_sed_file() appends it to the
                           // original name to form the backup file name
  struct arg_list *f, *e;  // -f script file names and -e script strings

  // processed pattern list
  struct double_list *pattern;

  // nextline: input line held back by sed_line() until the following line
  //           (or EOF) arrives.
  // remember: the "remembered line" used by the g, G, h, H and x commands
  //           (also borrowed by parse_pattern() to hold s/// regex text).
  char *nextline, *remember;
  // restart:   command to resume at after n/N, stored +1 so it is never NULL.
  // lastregex: regex most recently returned by get_regex(), reused when a
  //            script supplies an empty regex.
  void *restart, *lastregex;
  // nextlen:     length of nextline (parse_pattern() also uses it to count
  //              unclosed '{' while parsing).
  // rememberlen: length of remember.
  // count:       current input line number.
  long nextlen, rememberlen, count;
  // fdout: file descriptor emit() writes to.
  // noeol: nonzero when the last emit() on fdout did not end with a newline.
  int fdout, noeol;
  unsigned xx;             // line wrap width used by the "l" command
  char delim;              // input line separator: '\n' unless -z was given
)
189
190 // Linked list of parsed sed commands. Offset fields indicate location where
191 // regex or string starts, ala offset+(char *)struct, because we remalloc()
192 // these to expand them for multiline inputs, and pointers would have to be
193 // individually adjusted.
194
// One parsed sed command. The structure is followed, in the same allocation,
// by any compiled regexes and argument strings; the int "offset" members
// below locate them relative to the start of the structure (0 = absent).
struct sedcmd {
  struct sedcmd *next, *prev;

  // Begin and end of each match
  // lmatch values as stored by parse_pattern(): a positive line number,
  // -1 for "$", or -2-N for a "+N" second address. 0 means no line address.
  long lmatch[2]; // line number of match
  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
  // not: set when the command was prefixed with "!".
  // hit: in sed_line(), the line number at which the address range became
  //      active (0 when inactive). During parsing it instead marks a command
  //      awaiting a continuation line: the s/// delimiter, or 256 for text.
  unsigned not, hit;
  // Bits 3 and up hold the occurrence number from s///N (0 when not given).
  unsigned sflags; // s///flag bits: i=1, g=2, p=4
  char c;         // action
};
206
207 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)208 static int emit(char *line, long len, int eol)
209 {
210 int l, old = line[len];
211
212 if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
213 TT.noeol = !eol;
214 if (eol) line[len++] = '\n';
215 if (!len) return 0;
216 l = writeall(TT.fdout, line, len);
217 if (eol) line[len-1] = old;
218 if (l != len) {
219 if (TT.fdout != 1) perror_msg("short write");
220
221 return 1;
222 }
223
224 return 0;
225 }
226
// Grow the allocation at *old, which currently holds oldlen bytes, and append
// a copy of new to it. A negative newlen means "append -newlen bytes, with a
// '\n' inserted in front of them". The appended data is NUL terminated, *old
// is updated to the (possibly moved) allocation, and the return value points
// one byte past that NUL terminator.
static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  int separator = 0;
  char *base, *tail;

  if (newlen < 0) {
    newlen = -newlen;
    separator = 1;
  }

  base = xrealloc(*old, oldlen+newlen+separator+1);
  *old = base;

  tail = base+oldlen;
  if (separator) *tail++ = '\n';
  memcpy(tail, new, newlen);
  tail += newlen;
  *tail = 0;

  return tail+1;
}
242
243 // An empty regex repeats the previous one
get_regex(void * command,int offset)244 static void *get_regex(void *command, int offset)
245 {
246 if (!offset) {
247 if (!TT.lastregex) error_exit("no previous regex");
248 return TT.lastregex;
249 }
250
251 return TT.lastregex = offset+(char *)command;
252 }
253
254 // Apply pattern to line from input file
sed_line(char ** pline,long plen)255 static void sed_line(char **pline, long plen)
256 {
257 struct append {
258 struct append *next, *prev;
259 int file;
260 char *str;
261 } *append = 0;
262 char *line = TT.nextline;
263 long len = TT.nextlen;
264 struct sedcmd *command;
265 int eol = 0, tea = 0;
266
267 // Ignore EOF for all files before last unless -i
268 if (!pline && !FLAG(i)) return;
269
270 // Grab next line for deferred processing (EOF detection: we get a NULL
271 // pline at EOF to flush last line). Note that only end of _last_ input
272 // file matches $ (unless we're doing -i).
273 TT.nextline = 0;
274 TT.nextlen = 0;
275 if (pline) {
276 TT.nextline = *pline;
277 TT.nextlen = plen;
278 *pline = 0;
279 }
280
281 if (!line || !len) return;
282 if (line[len-1] == '\n') line[--len] = eol++;
283 TT.count++;
284
285 // The restart-1 is because we added one to make sure it wasn't NULL,
286 // otherwise N as last command would restart script
287 command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
288 TT.restart = 0;
289
290 while (command) {
291 char *str, c = command->c;
292
293 // Have we got a line or regex matching range for this rule?
294 if (*command->lmatch || *command->rmatch) {
295 int miss = 0;
296 long lm;
297
298 // In a match that might end?
299 if (command->hit) {
300 if (!(lm = command->lmatch[1])) {
301 if (!command->rmatch[1]) command->hit = 0;
302 else {
303 void *rm = get_regex(command, command->rmatch[1]);
304
305 // regex match end includes matching line, so defer deactivation
306 if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
307 }
308 } else if (lm > 0 && lm < TT.count) command->hit = 0;
309 else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
310
311 // Start a new match?
312 } else {
313 if (!(lm = *command->lmatch)) {
314 void *rm = get_regex(command, *command->rmatch);
315
316 if (line && !regexec0(rm, line, len, 0, 0, 0))
317 command->hit = TT.count;
318 } else if (lm == TT.count || (lm == -1 && !pline))
319 command->hit = TT.count;
320
321 if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
322 }
323
324 // Didn't match?
325 lm = !(command->not^!!command->hit);
326
327 // Deferred disable from regex end match
328 if (miss || command->lmatch[1] == TT.count) command->hit = 0;
329
330 if (lm) {
331 // Handle skipping curly bracket command group
332 if (c == '{') {
333 int curly = 1;
334
335 while (curly) {
336 command = command->next;
337 if (command->c == '{') curly++;
338 if (command->c == '}') curly--;
339 }
340 }
341 command = command->next;
342 continue;
343 }
344 }
345
346 // A deleted line can still update line match state for later commands
347 if (!line) {
348 command = command->next;
349 continue;
350 }
351
352 // Process command
353
354 if (c=='a' || c=='r') {
355 struct append *a = xzalloc(sizeof(struct append));
356 if (command->arg1) a->str = command->arg1+(char *)command;
357 a->file = c=='r';
358 dlist_add_nomalloc((void *)&append, (void *)a);
359 } else if (c=='b' || c=='t' || c=='T') {
360 int t = tea;
361
362 if (c != 'b') tea = 0;
363 if (c=='b' || t^(c=='T')) {
364 if (!command->arg1) break;
365 str = command->arg1+(char *)command;
366 for (command = (void *)TT.pattern; command; command = command->next)
367 if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
368 break;
369 if (!command) error_exit("no :%s", str);
370 }
371 } else if (c=='c') {
372 str = command->arg1+(char *)command;
373 if (!command->hit) emit(str, strlen(str), 1);
374 free(line);
375 line = 0;
376 continue;
377 } else if (c=='d') {
378 free(line);
379 line = 0;
380 continue;
381 } else if (c=='D') {
382 // Delete up to \n or end of buffer
383 str = line;
384 while ((str-line)<len) if (*(str++) == '\n') break;
385 len -= str - line;
386 memmove(line, str, len);
387
388 // if "delete" blanks line, disable further processing
389 // otherwise trim and restart script
390 if (!len) {
391 free(line);
392 line = 0;
393 } else {
394 line[len] = 0;
395 command = (void *)TT.pattern;
396 }
397 continue;
398 } else if (c=='g') {
399 free(line);
400 line = xstrdup(TT.remember);
401 len = TT.rememberlen;
402 } else if (c=='G') {
403 line = xrealloc(line, len+TT.rememberlen+2);
404 line[len++] = '\n';
405 memcpy(line+len, TT.remember, TT.rememberlen);
406 line[len += TT.rememberlen] = 0;
407 } else if (c=='h') {
408 free(TT.remember);
409 TT.remember = xstrdup(line);
410 TT.rememberlen = len;
411 } else if (c=='H') {
412 TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
413 TT.remember[TT.rememberlen++] = '\n';
414 memcpy(TT.remember+TT.rememberlen, line, len);
415 TT.remember[TT.rememberlen += len] = 0;
416 } else if (c=='i') {
417 str = command->arg1+(char *)command;
418 emit(str, strlen(str), 1);
419 } else if (c=='l') {
420 int i, x, off;
421
422 if (!TT.xx) {
423 terminal_size(&TT.xx, 0);
424 if (!TT.xx) TT.xx = 80;
425 if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
426 if (TT.xx > 4) TT.xx -= 4;
427 }
428
429 for (i = off = 0; i<len; i++) {
430 if (off >= TT.xx) {
431 toybuf[off++] = '\\';
432 emit(toybuf, off, 1);
433 off = 0;
434 }
435 x = stridx("\\\a\b\f\r\t\v", line[i]);
436 if (x != -1) {
437 toybuf[off++] = '\\';
438 toybuf[off++] = "\\abfrtv"[x];
439 } else if (line[i] >= ' ') toybuf[off++] = line[i];
440 else off += sprintf(toybuf+off, "\\%03o", line[i]);
441 }
442 toybuf[off++] = '$';
443 emit(toybuf, off, 1);
444 } else if (c=='n') {
445 TT.restart = command->next+1;
446
447 break;
448 } else if (c=='N') {
449 // Can't just grab next line because we could have multiple N and
450 // we need to actually read ahead to get N;$p EOF detection right.
451 if (pline) {
452 TT.restart = command->next+1;
453 extend_string(&line, TT.nextline, len, -TT.nextlen);
454 free(TT.nextline);
455 TT.nextline = line;
456 TT.nextlen += len + 1;
457 line = 0;
458 }
459
460 // Pending append goes out right after N
461 goto done;
462 } else if (c=='p' || c=='P') {
463 char *l = (c=='P') ? strchr(line, '\n') : 0;
464
465 if (emit(line, l ? l-line : len, eol)) break;
466 } else if (c=='q' || c=='Q') {
467 if (pline) *pline = (void *)1;
468 free(TT.nextline);
469 if (!toys.exitval && command->arg1)
470 toys.exitval = atoi(command->arg1+(char *)command);
471 TT.nextline = 0;
472 TT.nextlen = 0;
473 if (c=='Q') line = 0;
474
475 break;
476 } else if (c=='s') {
477 char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
478 regmatch_t *match = (void *)toybuf;
479 regex_t *reg = get_regex(command, command->arg1);
480 int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
481 mlen, off, newlen;
482
483 // Loop finding match in remaining line (up to remaining len)
484 while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
485 mflags = REG_NOTBOL;
486
487 // Zero length matches don't count immediately after a previous match
488 mlen = match[0].rm_eo-match[0].rm_so;
489 if (!mlen && !zmatch) {
490 if (rline-line == len) break;
491 l2[l2used++] = *rline++;
492 zmatch++;
493 continue;
494 } else zmatch = 0;
495
496 // If we're replacing only a specific match, skip if this isn't it
497 off = command->sflags>>3;
498 if (off && off != ++count) {
499 memcpy(l2+l2used, rline, match[0].rm_eo);
500 l2used += match[0].rm_eo;
501 rline += match[0].rm_eo;
502
503 continue;
504 }
505 // The fact getline() can allocate unbounded amounts of memory is
506 // a bigger issue, but while we're here check for integer overflow
507 if (match[0].rm_eo > INT_MAX) perror_exit(0);
508
509 // newlen = strlen(new) but with \1 and & and printf escapes
510 for (off = newlen = 0; new[off]; off++) {
511 int cc = -1;
512
513 if (new[off] == '&') cc = 0;
514 else if (new[off] == '\\') cc = new[++off] - '0';
515 if (cc < 0 || cc > 9) {
516 newlen++;
517 continue;
518 }
519 newlen += match[cc].rm_eo-match[cc].rm_so;
520 }
521
522 // Copy changed data to new string
523
524 // Adjust allocation size of new string, copy data we know we'll keep
525 l2l += newlen-mlen;
526 if ((l2l|0xfff) > l2old) l2 = xrealloc(l2, l2old = (l2l|0xfff)+1);
527 if (match[0].rm_so) {
528 memcpy(l2+l2used, rline, match[0].rm_so);
529 l2used += match[0].rm_so;
530 }
531
532 // copy in new replacement text
533 for (off = mlen = 0; new[off]; off++) {
534 int cc = 0, ll;
535
536 if (new[off] == '\\') {
537 cc = new[++off] - '0';
538 if (cc<0 || cc>9) {
539 if (!(l2[l2used+mlen++] = unescape(new[off])))
540 l2[l2used+mlen-1] = new[off];
541
542 continue;
543 } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
544 } else if (new[off] != '&') {
545 l2[l2used+mlen++] = new[off];
546
547 continue;
548 }
549
550 if (match[cc].rm_so != -1) {
551 ll = match[cc].rm_eo-match[cc].rm_so;
552 memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
553 mlen += ll;
554 }
555 }
556 l2used += newlen;
557 rline += match[0].rm_eo;
558
559 // Stop after first substitution unless we have flag g
560 if (!(command->sflags & 2)) break;
561 }
562
563 // If we made any changes, finish off l2 and swap it for line
564 if (l2) {
565 // grab trailing unmatched data and null terminator, swap with original
566 mlen = len-(rline-line);
567 memcpy(l2+l2used, rline, mlen+1);
568 len = l2used + mlen;
569 free(line);
570 line = l2;
571 }
572
573 if (mflags) {
574 // flag p
575 if (command->sflags & 4) emit(line, len, eol);
576
577 tea = 1;
578 if (command->w) goto writenow;
579 }
580 } else if (c=='w') {
581 int fd, noeol;
582 char *name;
583
584 writenow:
585 // Swap out emit() context
586 fd = TT.fdout;
587 noeol = TT.noeol;
588
589 // We save filehandle and newline status before filename
590 name = command->w + (char *)command;
591 memcpy(&TT.fdout, name, 4);
592 name += 4;
593 TT.noeol = *(name++);
594
595 // write, then save/restore context
596 if (emit(line, len, eol))
597 perror_exit("w '%s'", command->arg1+(char *)command);
598 *(--name) = TT.noeol;
599 TT.noeol = noeol;
600 TT.fdout = fd;
601 } else if (c=='x') {
602 long swap = TT.rememberlen;
603
604 str = TT.remember;
605 TT.remember = line;
606 line = str;
607 TT.rememberlen = len;
608 len = swap;
609 } else if (c=='y') {
610 char *from, *to = (char *)command;
611 int i, j;
612
613 from = to+command->arg1;
614 to += command->arg2;
615
616 for (i = 0; i < len; i++) {
617 j = stridx(from, line[i]);
618 if (j != -1) line[i] = to[j];
619 }
620 } else if (c=='=') {
621 sprintf(toybuf, "%ld", TT.count);
622 if (emit(toybuf, strlen(toybuf), 1)) break;
623 }
624
625 command = command->next;
626 }
627
628 if (line && !FLAG(n)) emit(line, len, eol);
629
630 done:
631 if (dlist_terminate(append)) while (append) {
632 struct append *a = append->next;
633
634 if (append->file) {
635 int fd = open(append->str, O_RDONLY);
636
637 // Force newline if noeol pending
638 if (fd != -1) {
639 if (TT.noeol) xwrite(TT.fdout, "\n", 1);
640 TT.noeol = 0;
641 xsendfile(fd, TT.fdout);
642 close(fd);
643 }
644 } else if (append->str) emit(append->str, strlen(append->str), 1);
645 else emit(line, 0, 0);
646 free(append);
647 append = a;
648 }
649 free(line);
650 }
651
652 // Callback called on each input file
do_sed_file(int fd,char * name)653 static void do_sed_file(int fd, char *name)
654 {
655 char *tmp;
656
657 if (FLAG(i)) {
658 struct sedcmd *command;
659
660 if (!fd) return error_msg("-i on stdin");
661 TT.fdout = copy_tempfile(fd, name, &tmp);
662 TT.count = 0;
663 for (command = (void *)TT.pattern; command; command = command->next)
664 command->hit = 0;
665 }
666 do_lines(fd, TT.delim, sed_line);
667 if (FLAG(i)) {
668 if (TT.i && *TT.i) {
669 char *s = xmprintf("%s%s", name, TT.i);
670
671 xrename(name, s);
672 free(s);
673 }
674 replace_tempfile(-1, TT.fdout, &tmp);
675 TT.fdout = 1;
676 TT.nextline = 0;
677 TT.nextlen = TT.noeol = 0;
678 }
679 }
680
// Copy chunk of string between two delimiters, converting printf escapes.
//
// pstr:  in: start of the text to parse. out (on success only): advanced to
//        the first character after the closing delimiter.
// delim: if NULL or pointing at 0, the first character of *pstr (or the one
//        following a leading backslash) is taken as the delimiter and, when
//        delim is not NULL, stored through it. Otherwise *delim is the
//        delimiter and *pstr starts directly with the content.
//
// A delimiter inside a regex [bracket expression] does not end the string.
// Returns a newly allocated copy the caller must free, or 0 on error (no
// usable delimiter, trailing backslash, or missing closing delimiter).
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *out, *to, *from, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;

  // The output is never longer than the input.
  to = out = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside brackets, ']' inside a bracket expression, and one of
  // ".=:" inside a [. .] [= =] or [: :] element of a bracket expression.
  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      } else if (mode == ']' && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  // Don't leak the partially filled copy when reporting an error.
  free(out);

  return 0;
}
745
746 // Translate pattern strings into command structures. Each command structure
747 // is a single allocation (which requires some math and remalloc at times).
parse_pattern(char ** pline,long len)748 static void parse_pattern(char **pline, long len)
749 {
750 struct sedcmd *command = (void *)TT.pattern;
751 char *line, *reg, c, *errstart;
752 int i;
753
754 line = errstart = pline ? *pline : "";
755 if (len && line[len-1]=='\n') line[--len] = 0;
756
757 // Append this line to previous multiline command? (hit indicates type.)
758 // During parsing "hit" stores data about line continuations, but in
759 // sed_line() it means the match range attached to this command
760 // is active, so processing the continuation must zero it again.
761 if (command && command->prev->hit) {
762 // Remove half-finished entry from list so remalloc() doesn't confuse it
763 TT.pattern = TT.pattern->prev;
764 command = dlist_pop(&TT.pattern);
765 c = command->c;
766 reg = (char *)command;
767 reg += command->arg1 + strlen(reg + command->arg1);
768
769 // Resume parsing for 'a' or 's' command. (Only two that can do this.)
770 // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
771 // a unicode character.
772 if (command->hit < 256) goto resume_s;
773 else goto resume_a;
774 }
775
776 // Loop through commands in this line.
777
778 command = 0;
779 for (;;) {
780 if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
781
782 // If there's no more data on this line, return.
783 for (;;) {
784 while (isspace(*line) || *line == ';') line++;
785 if (*line == '#') while (*line && *line != '\n') line++;
786 else break;
787 }
788 if (!*line) return;
789
790 // Start by writing data into toybuf.
791
792 errstart = line;
793 memset(toybuf, 0, sizeof(struct sedcmd));
794 command = (void *)toybuf;
795 reg = toybuf + sizeof(struct sedcmd);
796
797 // Parse address range (if any)
798 for (i = 0; i < 2; i++) {
799 if (*line == ',') line++;
800 else if (i) break;
801
802 if (i && *line == '+' && isdigit(line[1])) {
803 line++;
804 command->lmatch[i] = -2-strtol(line, &line, 0);
805 } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
806 else if (*line == '$') {
807 command->lmatch[i] = -1;
808 line++;
809 } else if (*line == '/' || *line == '\\') {
810 char *s = line;
811
812 if (!(s = unescape_delimited_string(&line, 0))) goto error;
813 if (!*s) command->rmatch[i] = 0;
814 else {
815 xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
816 command->rmatch[i] = reg-toybuf;
817 reg += sizeof(regex_t);
818 }
819 free(s);
820 } else break;
821 }
822
823 while (isspace(*line)) line++;
824 if (!*line) break;
825
826 if (*line == '!') {
827 command->not = 1;
828 line++;
829 }
830 while (isspace(*line)) line++;
831 if (!*line) break;
832
833 c = command->c = *(line++);
834 if (strchr("}:", c) && i) break;
835 if (strchr("aiqQr=", c) && i>1) break;
836
837 // Allocate memory and copy out of toybuf now that we know how big it is
838 command = xmemdup(toybuf, reg-toybuf);
839 reg = (reg-toybuf) + (char *)command;
840
841 // Parse arguments by command type
842 if (c == '{') TT.nextlen++;
843 else if (c == '}') {
844 if (!TT.nextlen--) break;
845 } else if (c == 's') {
846 char *end, delim = 0;
847
848 // s/pattern/replacement/flags
849
850 // line continuations use arg1 (back at the start of the function),
851 // so let's fill out arg2 first (since the regex part can't be multiple
852 // lines) and swap them back later.
853
854 // get pattern (just record, we parse it later)
855 command->arg2 = reg - (char *)command;
856 if (!(TT.remember = unescape_delimited_string(&line, &delim)))
857 goto error;
858
859 reg += sizeof(regex_t);
860 command->arg1 = reg-(char *)command;
861 command->hit = delim;
862 resume_s:
863 // get replacement - don't replace escapes yet because \1 and \& need
864 // processing later, after we replace \\ with \ we can't tell \\1 from \1
865 end = line;
866 while (*end != command->hit) {
867 if (!*end) goto error;
868 if (*end++ == '\\') {
869 if (!*end || *end == '\n') {
870 end[-1] = '\n';
871 break;
872 }
873 end++;
874 }
875 }
876
877 reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
878 line = end;
879 // line continuation? (note: '\n' can't be a valid delim).
880 if (*line == command->hit) command->hit = 0;
881 else {
882 if (!*line) continue;
883 reg--;
884 line++;
885 goto resume_s;
886 }
887
888 // swap arg1/arg2 so they're back in order arguments occur.
889 i = command->arg1;
890 command->arg1 = command->arg2;
891 command->arg2 = i;
892
893 // get flags
894 for (line++; *line; line++) {
895 long l;
896
897 if (isspace(*line) && *line != '\n') continue;
898
899 if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
900 else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
901 command->sflags |= l << 3;
902 line--;
903 } else break;
904 }
905
906 // We deferred actually parsing the regex until we had the s///i flag
907 // allocating the space was done by extend_string() above
908 if (!*TT.remember) command->arg1 = 0;
909 else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
910 (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE));
911 free(TT.remember);
912 TT.remember = 0;
913 if (*line == 'w') {
914 line++;
915 goto writenow;
916 }
917 } else if (c == 'w') {
918 int fd, delim;
919 char *cc;
920
921 // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
922 // eol status, and to retain the filename for error messages, we'd need
923 // to go up to arg5 just for this. Compromise: dynamically allocate the
924 // filehandle and eol status.
925
926 writenow:
927 while (isspace(*line)) line++;
928 if (!*line) goto error;
929 for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
930 delim = *cc;
931 *cc = 0;
932 fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
933 *cc = delim;
934
935 command->w = reg - (char *)command;
936 command = xrealloc(command, command->w+(cc-line)+6);
937 reg = command->w + (char *)command;
938
939 memcpy(reg, &fd, 4);
940 reg += 4;
941 *(reg++) = 0;
942 memcpy(reg, line, delim);
943 reg += delim;
944 *(reg++) = 0;
945
946 line = cc;
947 if (delim) line += 2;
948 } else if (c == 'y') {
949 char *s, delim = 0;
950 int len;
951
952 if (!(s = unescape_delimited_string(&line, &delim))) goto error;
953 command->arg1 = reg-(char *)command;
954 len = strlen(s);
955 reg = extend_string((void *)&command, s, reg-(char *)command, len);
956 free(s);
957 command->arg2 = reg-(char *)command;
958 if (!(s = unescape_delimited_string(&line, &delim))) goto error;
959 if (len != strlen(s)) goto error;
960 reg = extend_string((void *)&command, s, reg-(char*)command, len);
961 free(s);
962 } else if (strchr("abcirtTqQw:", c)) {
963 int end;
964
965 // trim leading spaces
966 while (isspace(*line) && *line != '\n') line++;
967
968 // Resume logic differs from 's' case because we don't add a newline
969 // unless it's after something, so we add it on return instead.
970 resume_a:
971 command->hit = 0;
972
973 // btTqQ: end with space or semicolon, aicrw continue to newline.
974 if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){
975 // Argument's optional for btTqQ
976 if (strchr("btTqQ", c)) continue;
977 else if (!command->arg1) break;
978 }
979 // Error checking: qQ can only have digits after them
980 if (c=='q' || c=='Q') {
981 for (i = 0; i<end && isdigit(line[i]); i++);
982 if (i != end) {
983 line += i;
984 break;
985 }
986 }
987
988 // Extend allocation to include new string. We use offsets instead of
989 // pointers so realloc() moving stuff doesn't break things. Ok to write
990 // \n over NUL terminator because call to extend_string() adds it back.
991 if (!command->arg1) command->arg1 = reg - (char*)command;
992 else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
993 else if (!pline) {
994 command->arg1 = 0;
995 continue;
996 }
997 reg = extend_string((void *)&command, line, reg - (char *)command, end);
998
999 // Recopy data to remove escape sequences and handle line continuation.
1000 if (strchr("aci", c)) {
1001 reg -= end+1;
1002 for (i = end; i; i--) {
1003 if ((*reg++ = *line++)=='\\') {
1004
1005 // escape at end of line: resume if -e escaped literal newline,
1006 // else request callback and resume with next line
1007 if (!--i) {
1008 *--reg = 0;
1009 if (*line) {
1010 line++;
1011 goto resume_a;
1012 }
1013 command->hit = 256;
1014 break;
1015 }
1016 if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
1017 line++;
1018 }
1019 }
1020 *reg = 0;
1021 } else line += end;
1022
1023 // Commands that take no arguments
1024 } else if (!strchr("{dDgGhHlnNpPx=", c)) break;
1025 }
1026
1027 error:
1028 error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
1029 }
1030
sed_main(void)1031 void sed_main(void)
1032 {
1033 struct arg_list *al;
1034 char **args = toys.optargs;
1035
1036 if (!FLAG(z)) TT.delim = '\n';
1037
1038 // Lie to autoconf when it asks stupid questions, so configure regexes
1039 // that look for "GNU sed version %f" greater than some old buggy number
1040 // don't fail us for not matching their narrow expectations.
1041 if (FLAG(version)) {
1042 xprintf("This is not GNU sed version 9.0\n");
1043 return;
1044 }
1045
1046 // Handling our own --version means we handle our own --help too.
1047 if (FLAG(help)) help_exit(0);
1048
1049 // Parse pattern into commands.
1050
1051 // If no -e or -f, first argument is the pattern.
1052 if (!TT.e && !TT.f) {
1053 if (!*toys.optargs) error_exit("no pattern");
1054 (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1055 }
1056
1057 // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1058 // so handle all -e, then all -f. (At least the behavior's consistent.)
1059
1060 for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1061 parse_pattern(0, 0);
1062 for (al = TT.f; al; al = al->next)
1063 do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1064 dlist_terminate(TT.pattern);
1065 if (TT.nextlen) error_exit("no }");
1066
1067 TT.fdout = 1;
1068 TT.remember = xstrdup("");
1069
1070 // Inflict pattern upon input files. Long version because !O_CLOEXEC
1071 loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1072
1073 // Provide EOF flush at end of cumulative input for non-i mode.
1074 if (!FLAG(i)) {
1075 toys.optflags |= FLAG_i;
1076 sed_line(0, 0);
1077 }
1078
1079 // todo: need to close fd when done for TOYBOX_FREE?
1080 }
1081