1 /* cut.c - print selected ranges from a file
2 *
3 * Copyright 2016 Rob Landley <rob@landley.net>
4 *
5 * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
6 *
7 * Deviations from posix: added -DF. We can only accept 512 selections, and
8 * "-" counts as start to end. Using spaces to separate a comma-separated list
9 * is silly and inconsistent with dd, ps, cp, and mount.
10 *
11 * todo: -n, -s with -c
12
13 USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))
14
15 config CUT
16 bool "cut"
17 default y
18 help
19 usage: cut [-Ds] [-bcfF LIST] [-dO DELIM] [FILE...]
20
21 Print selected parts of lines from each FILE to standard output.
22
23 Each selection LIST is comma separated, either numbers (counting from 1)
24 or dash separated ranges (inclusive, with X- meaning to end of line and -X
25 from start). By default selection ranges are sorted and collated, use -D
26 to prevent that.
27
28 -b select bytes
29 -c select UTF-8 characters
30 -C select unicode columns
31 -d use DELIM (default is TAB for -f, run of whitespace for -F)
32 -D Don't sort/collate selections
33 -f select fields (words) separated by single DELIM character
34 -F select fields separated by DELIM regex
35 -O output delimiter (default one space for -F, input delim for -f)
36 -s skip lines without delimiters
37 */
38 #define FOR_cut
39 #include "toys.h"
40
GLOBALS(
  char *d;  // -d delimiter; cut_main() supplies a default when unset
  char *O;  // -O output delimiter; cut_main() supplies a default for -f/-F
  // One argument list per selection type, indexed in the order "CFfcb"
  // (see cut_main). We treat them the same, so loop through.
  struct arg_list *select[5];

  int pairs;    // number of (start, end) pairs get_range() stored in toybuf
  regex_t reg;  // TT.d compiled by cut_main() when -F is given
)
49
// Count how many leading bytes of start fit inside the given number of display
// columns. The result ends just after the last positive-width character that
// fit: a character too wide for the remaining space is left out (round down).
// Bytes utf8towc() rejects are stepped over one at a time and add no width.
int unicolumns(char *start, unsigned columns)
{
  char *pos = start, *fit = start;
  int used = 0, n;
  wchar_t wc;

  for (;;) {
    if (used>=columns) break;

    // A zero result ends the scan (presumably the NUL terminator).
    n = utf8towc(&wc, pos, 4);
    if (!n) break;
    if (n<0) {
      pos++;
      continue;
    }
    pos += n;

    // Only characters that occupy columns move the "fits" mark forward.
    n = wcwidth(wc);
    if (n<1) continue;
    used += n;
    if (used>columns) break;
    fit = pos;
  }

  return fit-start;
}
72
73
74 // Apply selections to an input line, producing output
cut_line(char ** pline,long len)75 static void cut_line(char **pline, long len)
76 {
77 unsigned *pairs = (void *)toybuf;
78 char *line = *pline;
79 int i, j;
80
81 if (len && line[len-1]=='\n') line[--len] = 0;
82
83 // Loop through selections
84 for (i=0; i<TT.pairs; i++) {
85 unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
86 char *s = line, *ss;
87
88 // input: start/end position, count=difference between them
89 // output: s = start of string, len = bytes to output
90
91 if (start) start--;
92 if (start>=len) continue;
93 if (!end || end>len) end = len;
94 count = end-start;
95
96 // Find start and end of output string for the relevant selection type
97 if (toys.optflags&FLAG_b) s += start;
98 else if (toys.optflags&FLAG_C) {
99 // crunch_str() currently assumes that combining characters get
100 // escaped, to provide an unambiguous visual representation.
101 // This assumes the input string is null terminated.
102 //if (start) crunch_str(&s, start, 0, 0, 0);
103 //if (!*s) continue;
104 //start = s-line;
105 //ss = s;
106 //crunch_str(&ss, count, 0, 0, 0);
107 //count = ss-s;
108
109 s += unicolumns(s, start);
110 count = unicolumns(s, end-start);
111 } else if (toys.optflags&FLAG_c) {
112 wchar_t wc;
113 char *sss;
114
115 // Find start
116 ss = line+len;
117 while (start && s<ss) {
118 if (0<=(j = utf8towc(&wc, s, len))) start--;
119 s += (j<1) ? 1 : j;
120 }
121 if (s == ss) continue;
122
123 // Find end
124 end = count;
125 sss = s;
126 while (end && sss<ss) {
127 if (0<=(j = utf8towc(&wc, sss, len))) end--;
128 sss += (j<1) ? 1 : j;
129 }
130 count = sss-s;
131 } else {
132 regmatch_t match;
133
134 // Loop through skipping appropriate number of fields
135 for (j = 0; j<2; j++) {
136 ss = s;
137 if (j) start = count;
138 else end = start;
139 while (*ss && start) {
140 if (toys.optflags&FLAG_f) {
141 if (!strchr(TT.d, *ss++)) continue;
142 if (!--start && j) ss--;
143 } else {
144 if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
145 ss = line+len;
146 continue;
147 }
148 if (!match.rm_eo) break; // zero length match == no delimiter
149 ss += (!--start && j) ? match.rm_so : match.rm_eo;
150 }
151 }
152 if (!j && !*(s = ss)) break;
153 }
154
155 // If we never encountered even one separator, print whole line (posix!)
156 if (!j && end == start) {
157 if (toys.optflags&FLAG_s) return;
158 fwrite(line, len, 1, stdout);
159 break;
160 } else if (!*s) continue;
161 count = ss-s;
162 }
163 if (i && TT.O) fputs(TT.O, stdout);
164 fwrite(s, count, 1, stdout);
165 }
166 xputc('\n');
167 }
168
// qsort() comparison for (start, end) pairs of unsigned values: order by
// start, then by end. Returns -1, 0 or 1.
static int compar(unsigned *a, unsigned *b)
{
  int k;

  // First differing element (start, then end) decides the order
  for (k = 0; k<2; k++) if (a[k] != b[k]) return (a[k]<b[k]) ? -1 : 1;

  return 0;
}
178
179 // parse A or A-B or A- or -B
get_range(void * data,char * str,int len)180 static char *get_range(void *data, char *str, int len)
181 {
182 char *end = str;
183 unsigned *pairs = (void *)toybuf, i;
184
185 // Using toybuf[] to store ranges means we can have 512 selections max.
186 if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
187 pairs += 2*TT.pairs++;
188
189 pairs[1] = UINT_MAX;
190 for (i = 0; ;i++) {
191 if (i==2) return end;
192 if (isdigit(*end)) {
193 long long ll = estrtol(end, &end, 10);
194
195 if (ll<1 || ll>UINT_MAX || errno) return end;
196 pairs[i] = ll;
197 }
198 if (*end++ != '-') break;
199 }
200 if (!i) pairs[1] = pairs[0];
201 if ((end-str)<len) return end;
202 if (pairs[0]>pairs[1]) return str;
203
204 // No error
205 return 0;
206 }
207
cut_main(void)208 void cut_main(void)
209 {
210 int i;
211 char buf[8];
212
213 // Parse command line arguments
214 if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
215 error_exit("-s needs -Ff");
216 if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
217 error_exit("-d needs -Ff");
218 if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t";
219 if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
220 if (!TT.O) {
221 if (toys.optflags&FLAG_F) TT.O = " ";
222 else if (toys.optflags&FLAG_f) TT.O = TT.d;
223 }
224
225 // Parse ranges, which are attached to a selection type (only one can be set)
226 for (i = 0; i<ARRAY_LEN(TT.select); i++) {
227 sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
228 if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
229 }
230 if (!TT.pairs) error_exit("no selections");
231
232 // Sort and collate selections
233 if (!(toys.optflags&FLAG_D)) {
234 int from, to;
235 unsigned *pairs = (void *)toybuf;
236
237 qsort(toybuf, TT.pairs, 8, (void *)compar);
238 for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
239 if (pairs[from] > pairs[to+1]) {
240 to += 2;
241 memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
242 } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
243 }
244 TT.pairs = (to/2)+1;
245 }
246
247 // For each argument, loop through lines of file and call cut_line() on each
248 loopfiles_lines(toys.optargs, cut_line);
249 }
250