1 /* cut.c - print selected ranges from a file
2 *
3 * Copyright 2016 Rob Landley <rob@landley.net>
4 *
5 * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
6 *
7 * Deviations from posix: added -DF. We can only accept 512 selections, and
8 * "-" counts as start to end. Using spaces to separate a comma-separated list
9 * is silly and inconsistent with dd, ps, cp, and mount.
10 *
11 * TODO: -s with -c
12
13 USE_CUT(NEWTOY(cut, "b*|c*|f*|F(regex-fields)*|C*|O(output-delimiter):d:sD(allow-duplicates)n[!cbfF]", TOYFLAG_USR|TOYFLAG_BIN))
14
15 config CUT
16 bool "cut"
17 default y
18 help
19 usage: cut [-Ds] [-bcCfF LIST] [-dO DELIM] [FILE...]
20
21 Print selected parts of lines from each FILE to standard output.
22
23 Each selection LIST is comma separated, either numbers (counting from 1)
24 or dash separated ranges (inclusive, with X- meaning to end of line and -X
25 from start). By default selection ranges are sorted and collated, use -D
26 to prevent that.
27
28 -b Select bytes (with -n round start/end down to start of utf8 char)
29 -c Select UTF-8 characters
30 -C Select unicode columns
31 -d Use DELIM (default is TAB for -f, run of whitespace for -F)
32 -D Don't sort/collate selections or match -fF lines without delimiter
33 -f Select fields (words) separated by single DELIM character
34 -F Select fields separated by DELIM regex
35 -O Output delimiter (default one space for -F, input delim for -f)
36 -s Skip lines without delimiters
37 */
38 #define FOR_cut
39 #include "toys.h"
40
// Per-command state, accessed through TT. in the functions below.
41 GLOBALS(
// d: input delimiter (cut_main() supplies a default when unset)
// O: output delimiter written between selections (may stay NULL)
42 char *d, *O;
// One argument list per selection type; cut_main() labels them "CFfcb"
43 struct arg_list *select[5]; // we treat them the same, so loop through
44
// Count of input lines processed, incremented at each cut_line() call
45 unsigned line;
// Number of start/end selection pairs stored in toybuf[]
46 int pairs;
// Compiled delimiter regex, set up by cut_main() only when -F is used
47 regex_t reg;
48 )
49
50 // Apply selections to an input line, producing output
cut_line(char ** pline,long len)51 static void cut_line(char **pline, long len)
52 {
53 unsigned *pairs = (void *)toybuf, wc;
54 char *line;
55 int i, j, k;
56
57 if (!pline) return;
58 line = *pline;
59 if (len && line[len-1]=='\n') line[--len] = 0;
60 TT.line++;
61
62 // Loop through selections
63 for (i=0; i<TT.pairs; i++) {
64 unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
65 char *s = line, *ss, *sss;
66
67 // when the delimiter is \n output lines.
68 if (*TT.d == '\n') {
69 if (TT.line<start || TT.line>end) {
70 if (i+1 == TT.pairs) return;
71 continue;
72 }
73 goto write_line;
74 }
75
76 // input: start/end position, count=difference between them
77 // output: s = start of string, len = bytes to output
78
79 if (start) start--;
80 if (start>=len) continue;
81 if (!end || end>len) end = len;
82 count = end-start;
83
84 // Find start and end of output string for the relevant selection type
85 if (FLAG(b)) {
86 if (!FLAG(n)) s += start;
87 else {
88 if (end>len) end = len;
89 for (sss = ss = s; (k = (ss-line))<end;) {
90 if (0>(j = utf8towc(&wc, ss, len))) ss++;
91 else {
92 if (((ss += j)-line)<=end) sss = ss;
93 if ((ss-line)<=start) s = ss;
94 }
95 }
96 if (!(count = sss-s)) continue;
97 }
98 } else if (FLAG(C)) {
99 // crunch_str() currently assumes that combining characters get
100 // escaped, to provide an unambiguous visual representation.
101 // This assumes the input string is null terminated.
102 if (start) crunch_str(&s, start, 0, 0, 0);
103 if (!*s) continue;
104 start = s-line;
105 ss = s;
106 crunch_str(&ss, count, 0, 0, 0);
107 count = ss-s;
108
109 } else if (FLAG(c)) {
110
111 // Find start
112 ss = line+len;
113 while (start && s<ss) {
114 if (0<=(j = utf8towc(&wc, s, len))) start--;
115 s += (j<1) ? 1 : j;
116 }
117 if (s == ss) continue;
118
119 // Find end
120 end = count;
121 sss = s;
122 while (end && sss<ss) {
123 if (0<=(j = utf8towc(&wc, sss, len))) end--;
124 sss += (j<1) ? 1 : j;
125 }
126 count = sss-s;
127 } else {
128 regmatch_t match;
129
130 // Loop through skipping appropriate number of fields
131 for (j = 0; j<2; j++) {
132 ss = s;
133 if (j) start = count;
134 else end = start;
135 while (*ss && start) {
136 if (FLAG(f)) {
137 if (!strchr(TT.d, *ss++)) continue;
138 if (!--start && j) ss--;
139 } else {
140 if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
141 ss = line+len;
142 continue;
143 }
144 if (!match.rm_eo) break; // zero length match == no delimiter
145 ss += (!--start && j) ? match.rm_so : match.rm_eo;
146 }
147 }
148 if (!j && !*(s = ss)) break;
149 }
150
151 // If we never encountered even one separator, print whole line (posix!)
152 if (!j && end == start) {
153 if (FLAG(D)) break;
154 if (FLAG(s)) return;
155 write_line:
156 fwrite(line, len, 1, stdout);
157 break;
158 } else if (!*s) continue;
159 count = ss-s;
160 }
161 if (i && TT.O) fputs(TT.O, stdout);
162 #ifdef TOYBOX_OH_ADAPT
163 /* fix "cut -s -d, -f-3 A.txt" not filter problem*/
164 if (!FLAG(s) || strchr(s, *TT.d)) {
165 fwrite(s, count, 1, stdout);
166 }
167 #else
168 fwrite(s, count, 1, stdout);
169 #endif
170 }
171 xputc('\n');
172 }
173
// Ordering callback for sorting selections: compares two (start, end) pairs,
// by start first and by end when the starts are equal.
// Returns -1, 0 or 1.
static int compar(unsigned *a, unsigned *b)
{
  int slot;

  // First differing slot decides the order
  for (slot = 0; slot<2; slot++)
    if (a[slot] != b[slot]) return (a[slot]<b[slot]) ? -1 : 1;

  return 0;
}
183
184 // parse A or A-B or A- or -B
get_range(void * data,char * str,int len)185 static char *get_range(void *data, char *str, int len)
186 {
187 char *end = str;
188 unsigned *pairs = (void *)toybuf, i;
189
190 // Using toybuf[] to store ranges means we can have 512 selections max.
191 if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
192 pairs += 2*TT.pairs++;
193
194 pairs[1] = UINT_MAX;
195 for (i = 0; ;i++) {
196 if (i==2) return end;
197 if (isdigit(*end)) {
198 long long ll = estrtol(end, &end, 10);
199
200 if (ll<1 || ll>UINT_MAX || errno) return end;
201 pairs[i] = ll;
202 }
203 if (*end++ != '-') break;
204 }
205 if (!i) pairs[1] = pairs[0];
206 if ((end-str)<len) return end;
207 if (pairs[0]>pairs[1]) return str;
208
209 // No error
210 return 0;
211 }
212
cut_main(void)213 void cut_main(void)
214 {
215 int i;
216 char buf[8];
217
218 // Parse command line arguments
219 if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
220 error_exit("-s needs -Ff");
221 if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
222 error_exit("-d needs -Ff");
223 if (!TT.d) TT.d = (FLAG(F)) ? "[[:space:]][[:space:]]*" : "\t";
224 if (FLAG(F)) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
225 if (!TT.O) {
226 if (FLAG(F)) TT.O = " ";
227 else if (FLAG(f)) TT.O = TT.d;
228 }
229
230 // Parse ranges, which are attached to a selection type (only one can be set)
231 for (i = 0; i<ARRAY_LEN(TT.select); i++) {
232 sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
233 if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
234 }
235 if (!TT.pairs) error_exit("no selections");
236
237 // Sort and collate selections
238 if (!FLAG(D)) {
239 int from, to;
240 unsigned *pairs = (void *)toybuf;
241
242 qsort(toybuf, TT.pairs, 8, (void *)compar);
243 for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
244 if (pairs[from] > pairs[to+1]) {
245 to += 2;
246 memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
247 } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
248 }
249 TT.pairs = (to/2)+1;
250 }
251
252 // For each argument, loop through lines of file and call cut_line() on each
253 loopfiles_lines(toys.optargs, cut_line);
254 }
255