• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* cut.c - print selected ranges from a file
2  *
3  * Copyright 2016 Rob Landley <rob@landley.net>
4  *
5  * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
6  *
7  * Deviations from posix: added -CDFO. We can only accept 512 selections, and
8  * "-" counts as start to end. Using spaces to separate a comma-separated list
9  * is silly and inconsistent with dd, ps, cp, and mount.
10  *
11  * todo: -n, -s with -c
12 
13 USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))
14 
15 config CUT
16   bool "cut"
17   default y
18   help
19     usage: cut [-Ds] [-bcfF LIST] [-dO DELIM] [FILE...]
20 
21     Print selected parts of lines from each FILE to standard output.
22 
23     Each selection LIST is comma separated, either numbers (counting from 1)
24     or dash separated ranges (inclusive, with X- meaning to end of line and -X
25     from start). By default selection ranges are sorted and collated, use -D
26     to prevent that.
27 
28     -b	select bytes
29     -c	select UTF-8 characters
30     -C	select unicode columns
31     -d	use DELIM (default is TAB for -f, run of whitespace for -F)
32     -D	Don't sort/collate selections
33     -f	select fields (words) separated by single DELIM character
34     -F	select fields separated by DELIM regex
35     -O	output delimiter (default one space for -F, input delim for -f)
36     -s	skip lines without delimiters
37 */
38 #define FOR_cut
39 #include "toys.h"
40 
GLOBALS(char * d;char * O;struct arg_list * select[5];int pairs;regex_t reg;)41 GLOBALS(
42   char *d;
43   char *O;
44   struct arg_list *select[5]; // we treat them the same, so loop through
45 
46   int pairs;
47   regex_t reg;
48 )
49 
// Measure how many bytes of a string fit in a given number of display columns.
//
// start:   string to measure (decoded with utf8towc(), 4 bytes offered per call)
// columns: display columns available
//
// Returns the byte offset just past the last printable character that fits
// entirely within the column budget. A character that would straddle the
// limit is left out (rounding down). Bytes utf8towc() reports a negative
// result for are stepped over one at a time, and characters whose wcwidth()
// is not positive never advance the returned offset on their own.
int unicolumns(char *start, unsigned columns)
{
  char *pos = start, *fit = start;
  int used = 0;
  wchar_t wc;

  for (;;) {
    int bytes, width;

    if (!(used<columns)) break;

    bytes = utf8towc(&wc, pos, 4);
    if (!bytes) break;

    // Undecodable byte: step over it without counting a column
    if (bytes<0) {
      pos++;
      continue;
    }
    pos += bytes;

    width = wcwidth(wc);
    if (width<1) continue;

    // Stop before a wide character that would overshoot the budget
    used += width;
    if (used>columns) break;
    fit = pos;
  }

  return fit-start;
}
72 
73 
74 // Apply selections to an input line, producing output
cut_line(char ** pline,long len)75 static void cut_line(char **pline, long len)
76 {
77   unsigned *pairs = (void *)toybuf;
78   char *line = *pline;
79   int i, j;
80 
81   if (len && line[len-1]=='\n') line[--len] = 0;
82 
83   // Loop through selections
84   for (i=0; i<TT.pairs; i++) {
85     unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
86     char *s = line, *ss;
87 
88     // input: start/end position, count=difference between them
89     // output: s = start of string, len = bytes to output
90 
91     if (start) start--;
92     if (start>=len) continue;
93     if (!end || end>len) end = len;
94     count = end-start;
95 
96     // Find start and end of output string for the relevant selection type
97     if (toys.optflags&FLAG_b) s += start;
98     else if (toys.optflags&FLAG_C) {
99       // crunch_str() currently assumes that combining characters get
100       // escaped, to provide an unambiguous visual representation.
101       // This assumes the input string is null terminated.
102       //if (start) crunch_str(&s, start, 0, 0, 0);
103       //if (!*s) continue;
104       //start = s-line;
105       //ss = s;
106       //crunch_str(&ss, count, 0, 0, 0);
107       //count = ss-s;
108 
109       s += unicolumns(s, start);
110       count = unicolumns(s, end-start);
111     } else if (toys.optflags&FLAG_c) {
112       wchar_t wc;
113       char *sss;
114 
115       // Find start
116       ss = line+len;
117       while (start && s<ss) {
118         if (0<=(j = utf8towc(&wc, s, len))) start--;
119         s += (j<1) ? 1 : j;
120       }
121       if (s == ss) continue;
122 
123       // Find end
124       end = count;
125       sss = s;
126       while (end && sss<ss) {
127         if (0<=(j = utf8towc(&wc, sss, len))) end--;
128         sss += (j<1) ? 1 : j;
129       }
130       count = sss-s;
131     } else {
132       regmatch_t match;
133 
134       // Loop through skipping appropriate number of fields
135       for (j = 0; j<2; j++) {
136         ss = s;
137         if (j) start = count;
138         else end = start;
139         while (*ss && start) {
140           if (toys.optflags&FLAG_f) {
141             if (!strchr(TT.d, *ss++)) continue;
142             if (!--start && j) ss--;
143           } else {
144             if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
145               ss = line+len;
146               continue;
147             }
148             if (!match.rm_eo) break; // zero length match == no delimiter
149             ss += (!--start && j) ? match.rm_so : match.rm_eo;
150           }
151         }
152         if (!j && !*(s = ss)) break;
153       }
154 
155       // If we never encountered even one separator, print whole line (posix!)
156       if (!j && end == start) {
157         if (toys.optflags&FLAG_s) return;
158         fwrite(line, len, 1, stdout);
159         break;
160       } else if (!*s) continue;
161       count = ss-s;
162     }
163     if (i && TT.O) fputs(TT.O, stdout);
164     fwrite(s, count, 1, stdout);
165   }
166   xputc('\n');
167 }
168 
// qsort() ordering for selection pairs: ascending by start position, with
// ties broken by ascending end position. Returns -1, 0 or 1.
static int compar(unsigned *a, unsigned *b)
{
  int slot;

  // The first slot that differs decides the ordering
  for (slot = 0; slot<2; slot++)
    if (a[slot] != b[slot]) return (a[slot]<b[slot]) ? -1 : 1;

  return 0;
}
178 
179 // parse A or A-B or A- or -B
get_range(void * data,char * str,int len)180 static char *get_range(void *data, char *str, int len)
181 {
182   char *end = str;
183   unsigned *pairs = (void *)toybuf, i;
184 
185   // Using toybuf[] to store ranges means we can have 512 selections max.
186   if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
187   pairs += 2*TT.pairs++;
188 
189   pairs[1] = UINT_MAX;
190   for (i = 0; ;i++) {
191     if (i==2) return end;
192     if (isdigit(*end)) {
193       long long ll = estrtol(end, &end, 10);
194 
195       if (ll<1 || ll>UINT_MAX || errno) return end;
196       pairs[i] = ll;
197     }
198     if (*end++ != '-') break;
199   }
200   if (!i) pairs[1] = pairs[0];
201   if ((end-str)<len) return end;
202   if (pairs[0]>pairs[1]) return str;
203 
204   // No error
205   return 0;
206 }
207 
cut_main(void)208 void cut_main(void)
209 {
210   int i;
211   char buf[8];
212 
213   // Parse command line arguments
214   if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
215     error_exit("-s needs -Ff");
216   if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
217     error_exit("-d needs -Ff");
218   if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t";
219   if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
220   if (!TT.O) {
221     if (toys.optflags&FLAG_F) TT.O = " ";
222     else if (toys.optflags&FLAG_f) TT.O = TT.d;
223   }
224 
225   // Parse ranges, which are attached to a selection type (only one can be set)
226   for (i = 0; i<ARRAY_LEN(TT.select); i++) {
227     sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
228     if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
229   }
230   if (!TT.pairs) error_exit("no selections");
231 
232   // Sort and collate selections
233   if (!(toys.optflags&FLAG_D)) {
234     int from, to;
235     unsigned *pairs = (void *)toybuf;
236 
237     qsort(toybuf, TT.pairs, 8, (void *)compar);
238     for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
239       if (pairs[from] > pairs[to+1]) {
240         to += 2;
241         memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
242       } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
243     }
244     TT.pairs = (to/2)+1;
245   }
246 
247   // For each argument, loop through lines of file and call cut_line() on each
248   loopfiles_lines(toys.optargs, cut_line);
249 }
250