1 /* cut.c - print selected ranges from a file
2 *
3 * Copyright 2016 Rob Landley <rob@landley.net>
4 *
5 * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
6 *
7 * Deviations from posix: added -DF. We can only accept 512 selections, and
8 * "-" counts as start to end. Using spaces to separate a comma-separated list
9 * is silly and inconsistent with dd, ps, cp, and mount.
10 *
11 * TODO: -s with -c
12
13 USE_CUT(NEWTOY(cut, "b*|c*|f*|F(regex-fields)*|C*|O(output-delimiter):d:sD(allow-duplicates)n[!cbfF]", TOYFLAG_USR|TOYFLAG_BIN))
14
15 config CUT
16 bool "cut"
17 default y
18 help
19 usage: cut [-Ds] [-bcCfF LIST] [-dO DELIM] [FILE...]
20
21 Print selected parts of lines from each FILE to standard output.
22
23 Each selection LIST is comma separated, either numbers (counting from 1)
24 or dash separated ranges (inclusive, with X- meaning to end of line and -X
25 from start). By default selection ranges are sorted and collated, use -D
26 to prevent that.
27
28 -b Select bytes (with -n round start/end down to start of utf8 char)
29 -c Select UTF-8 characters
30 -C Select unicode columns
31 -d Use DELIM (default is TAB for -f, run of whitespace for -F)
32 -D Don't sort/collate selections or match -fF lines without delimiter
33 -f Select fields (words) separated by single DELIM character
34 -F Select fields separated by DELIM regex
35 -O Output delimiter (default one space for -F, input delim for -f)
36 -s Skip lines without delimiters
37 */
38 #define FOR_cut
39 #include "toys.h"
40
GLOBALS(
  // -d and -O arguments; cut_main() fills in defaults when they are unset.
  char *d, *O;
  // Selection LISTs, one slot per selection type. We treat them the same, so
  // loop through; cut_main() labels slot i with "CFfcb"[i].
  struct arg_list *select[5];

  // Lines seen so far; cut_line() increments it once per call.
  unsigned line;
  // Number of (start, end) pairs currently stored in toybuf.
  int pairs;
  // Delimiter regex, compiled by cut_main() from TT.d when -F is given.
  regex_t reg;
)
49
50 // Apply selections to an input line, producing output
cut_line(char ** pline,long len)51 static void cut_line(char **pline, long len)
52 {
53 unsigned *pairs = (void *)toybuf, wc;
54 char *line;
55 int i, j, k;
56
57 if (!pline) return;
58 line = *pline;
59 if (len && line[len-1]=='\n') line[--len] = 0;
60 TT.line++;
61
62 // Loop through selections
63 for (i=0; i<TT.pairs; i++) {
64 unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
65 char *s = line, *ss, *sss;
66
67 // when the delimiter is \n output lines.
68 if (*TT.d == '\n') {
69 if (TT.line<start || TT.line>end) {
70 if (i+1 == TT.pairs) return;
71 continue;
72 }
73 goto write_line;
74 }
75
76 // input: start/end position, count=difference between them
77 // output: s = start of string, len = bytes to output
78
79 if (start) start--;
80 if (start>=len) continue;
81 if (!end || end>len) end = len;
82 count = end-start;
83
84 // Find start and end of output string for the relevant selection type
85 if (FLAG(b)) {
86 if (!FLAG(n)) s += start;
87 else {
88 if (end>len) end = len;
89 for (sss = ss = s; (k = (ss-line))<end;) {
90 if (0>(j = utf8towc(&wc, ss, len))) ss++;
91 else {
92 if (((ss += j)-line)<=end) sss = ss;
93 if ((ss-line)<=start) s = ss;
94 }
95 }
96 if (!(count = sss-s)) continue;
97 }
98 } else if (FLAG(C)) {
99 // crunch_str() currently assumes that combining characters get
100 // escaped, to provide an unambiguous visual representation.
101 // This assumes the input string is null terminated.
102 if (start) crunch_str(&s, start, 0, 0, 0);
103 if (!*s) continue;
104 start = s-line;
105 ss = s;
106 crunch_str(&ss, count, 0, 0, 0);
107 count = ss-s;
108
109 } else if (FLAG(c)) {
110
111 // Find start
112 ss = line+len;
113 while (start && s<ss) {
114 if (0<=(j = utf8towc(&wc, s, len))) start--;
115 s += (j<1) ? 1 : j;
116 }
117 if (s == ss) continue;
118
119 // Find end
120 end = count;
121 sss = s;
122 while (end && sss<ss) {
123 if (0<=(j = utf8towc(&wc, sss, len))) end--;
124 sss += (j<1) ? 1 : j;
125 }
126 count = sss-s;
127 } else {
128 regmatch_t match;
129
130 // Loop through skipping appropriate number of fields
131 for (j = 0; j<2; j++) {
132 ss = s;
133 if (j) start = count;
134 else end = start;
135 while (*ss && start) {
136 if (FLAG(f)) {
137 if (!strchr(TT.d, *ss++)) continue;
138 if (!--start && j) ss--;
139 } else {
140 if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
141 ss = line+len;
142 continue;
143 }
144 if (!match.rm_eo) break; // zero length match == no delimiter
145 ss += (!--start && j) ? match.rm_so : match.rm_eo;
146 }
147 }
148 if (!j && !*(s = ss)) break;
149 }
150
151 // If we never encountered even one separator, print whole line (posix!)
152 if (!j && end == start) {
153 if (FLAG(D)) break;
154 if (FLAG(s)) return;
155 write_line:
156 fwrite(line, len, 1, stdout);
157 break;
158 } else if (!*s) continue;
159 count = ss-s;
160 }
161 if (i && TT.O) fputs(TT.O, stdout);
162 fwrite(s, count, 1, stdout);
163 }
164 xputc('\n');
165 }
166
// Three-way comparison of two {start, end} pairs: order by start, and by end
// when the starts are equal. Returns -1, 0 or 1.
static int compar(unsigned *a, unsigned *b)
{
  // Compare the end values only when the start values tie.
  int which = (a[0] == b[0]);
  unsigned lhs = a[which], rhs = b[which];

  return (lhs > rhs) - (lhs < rhs);
}
176
177 // parse A or A-B or A- or -B
get_range(void * data,char * str,int len)178 static char *get_range(void *data, char *str, int len)
179 {
180 char *end = str;
181 unsigned *pairs = (void *)toybuf, i;
182
183 // Using toybuf[] to store ranges means we can have 512 selections max.
184 if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
185 pairs += 2*TT.pairs++;
186
187 pairs[1] = UINT_MAX;
188 for (i = 0; ;i++) {
189 if (i==2) return end;
190 if (isdigit(*end)) {
191 long long ll = estrtol(end, &end, 10);
192
193 if (ll<1 || ll>UINT_MAX || errno) return end;
194 pairs[i] = ll;
195 }
196 if (*end++ != '-') break;
197 }
198 if (!i) pairs[1] = pairs[0];
199 if ((end-str)<len) return end;
200 if (pairs[0]>pairs[1]) return str;
201
202 // No error
203 return 0;
204 }
205
cut_main(void)206 void cut_main(void)
207 {
208 int i;
209 char buf[8];
210
211 // Parse command line arguments
212 if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
213 error_exit("-s needs -Ff");
214 if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
215 error_exit("-d needs -Ff");
216 if (!TT.d) TT.d = (FLAG(F)) ? "[[:space:]][[:space:]]*" : "\t";
217 if (FLAG(F)) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
218 if (!TT.O) {
219 if (FLAG(F)) TT.O = " ";
220 else if (FLAG(f)) TT.O = TT.d;
221 }
222
223 // Parse ranges, which are attached to a selection type (only one can be set)
224 for (i = 0; i<ARRAY_LEN(TT.select); i++) {
225 sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
226 if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
227 }
228 if (!TT.pairs) error_exit("no selections");
229
230 // Sort and collate selections
231 if (!FLAG(D)) {
232 int from, to;
233 unsigned *pairs = (void *)toybuf;
234
235 qsort(toybuf, TT.pairs, 8, (void *)compar);
236 for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
237 if (pairs[from] > pairs[to+1]) {
238 to += 2;
239 memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
240 } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
241 }
242 TT.pairs = (to/2)+1;
243 }
244
245 // For each argument, loop through lines of file and call cut_line() on each
246 loopfiles_lines(toys.optargs, cut_line);
247 }
248