• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* awk.c - An awk implementation.
2  * vi: tabstop=2 softtabstop=2 shiftwidth=2
3  *
4  * Copyright 2024 Ray Gardner <raygard@gmail.com>
5  *
6  * See https://pubs.opengroup.org/onlinepubs/9699919799/utilities/awk.html
7 
8 USE_AWK(NEWTOY(awk, "F:v*f*bc", TOYFLAG_USR|TOYFLAG_BIN))
9 
10 config AWK
11   bool "awk"
12   default n
13   help
14     usage:  awk [-F sepstring] [-v assignment]... program [argument...]
15       or:
16             awk [-F sepstring] -f progfile [-f progfile]... [-v assignment]...
17                   [argument...]
18       also:
19       -b : use bytes, not characters
20       -c : compile only, do not run
21 */
22 
23 #define FOR_awk
24 #include "toys.h"
25 
GLOBALS(
  struct arg_list *f;   // -f progfile arguments (see NEWTOY option string)
  struct arg_list *v;   // -v assignment arguments
  char *F;              // -F sepstring argument

  // Lexical scanner state, reached through TT.scs.
  struct scanner_state {
      char *p;                      // next char to fetch; see get_char()
      char *progstring;             // program text, if not read from file(s)
      struct arg_list *prog_args;   // program files not yet opened
      char *filename;               // program file set by progfile_open()
      char *line;                   // getline() buffer for the program file
      size_t line_size;             // allocated size of line (from getline())
      ssize_t line_len;             // result of the last getline() call
      int line_num;                 // bumped for each line read from file
      int ch;                       // latest char fetched by gch()
      FILE *fp;                     // currently open program file, or 0
      // state includes latest token seen
      int tok;
      int tokbuiltin;
      int toktype;
      char *tokstr;                 // token text, NUL-terminated
      size_t maxtok;                // allocated size of tokstr
      size_t toklen;                // bytes currently in tokstr
      double numval;                // value set by get_number()
      int error;  // Set if lexical error.
  } *scs;
  char *tokstr;
  int prevtok;

  // Compiler state, reached through TT.cgl.
  struct compiler_globals {
    int in_print_stmt;
    int paren_level;
    int in_function_body;
    int funcnum;
    int nparms;
    int compile_error_count;  // bumped by zzerr() for non-warning messages
    int first_begin;
    int last_begin;
    int first_end;
    int last_end;
    int first_recrule;
    int last_recrule;
    int break_dest;
    int continue_dest;
    int stack_offset_to_fix;  // fixup stack if return in for(e in a)
    int range_pattern_num;
    int rule_type;  // tkbegin, tkend, or 0
  } cgl;

  // zvalue: the main awk value type
  // Can be number or string or both, or else map (array) or regex
  // The flags member (ZF_* bits) says which of num / union member is valid.
  struct zvalue {
    unsigned flags;
    double num;
    union { // anonymous union not in C99; not going to fix it now.
      struct zstring *vst;
      struct zmap *map;
      regex_t *rx;
    };
  } nozvalue;   // to shut up compiler warning TODO FIXME

  // Runtime state, reached through TT.rgl.
  struct runtime_globals {
    struct zvalue cur_arg;
    //char *filename;     // UNUSED
    FILE *fp;           // current data file
    int narg;           // cmdline arg index
    int nfiles;         // num of cmdline data file args processed
    int eof;            // all cmdline files (incl. stdin) read
    char *recptr;
    char *recbuf;
    size_t recbufsize;
    char *recbuf_multx;
    size_t recbufsize_multx;
    struct zstring *zspr;      // Global to receive sprintf() string value
  } rgl;

  // Expanding sequential list
  // base: start of storage; limit: end of allocated storage;
  // avail: where the next element goes; size: bytes per element.
  struct zlist {
    char *base, *limit, *avail;
    size_t size;
  } globals_table,  // global symbol table
    locals_table,     // local symbol table
    func_def_table;  // function symbol table
  // runtime lists
  struct zlist literals, fields, zcode, stack;

  char *progname;   // printed as the prefix of every zzerr() message

  int spec_var_limit;
  int zcode_last;
  struct zvalue *stackp;  // top of stack ptr

  char *pbuf;   // Used for number formatting in num_to_zstring()
#define RS_MAX  64
  char rs_last[RS_MAX];
  regex_t rx_rs_default, rx_rs_last;
  regex_t rx_default, rx_last, rx_printf_fmt;
#define FS_MAX  64
  char fs_last[FS_MAX];
  char one_char_fs[4];
  int nf_internal;  // should match NF
  char range_sw[64];   // FIXME TODO quick and dirty set of range switches
  int file_cnt, std_file_cnt;

  // One entry per file or pipe; entries are chained through next.
  struct zfile {
    struct zfile *next;
    char *fn;
    FILE *fp;
    char mode;  // w, a, or r
    char file_or_pipe;  // f or p
    char is_std_file;
    char *recbuf;
    size_t recbufsize;
    char *recbuf_multi;
    size_t recbufsize_multi;
    char *recbuf_multx;
    size_t recbufsize_multx;
    int recoffs, endoffs;
  } *zfiles, *cfile, *zstdout;
)
146 
// Marks an intentional fall through between switch cases. Expands to the
// GNU fallthrough attribute when __GNUC__ is defined, otherwise to nothing.
#ifdef __GNUC__
#define ATTR_FALLTHROUGH_INTENDED __attribute__ ((fallthrough))
#else
#define ATTR_FALLTHROUGH_INTENDED
#endif
152 
153 ////////////////////
154 ////   declarations
155 ////////////////////
156 
// Size constant associated with the TT.pbuf number-formatting buffer.
#define PBUFSIZE  512 // For num_to_zstring()
158 
// Token classes, stored in the scanner's toktype field.
// End of input is not listed here: stdio's EOF (-1) is used for that.
enum toktypes {
  ERROR = 2,
  NEWLINE,
  VAR,
  NUMBER,
  STRING,
  REGEX,
  USERFUNC,
  BUILTIN,
  TOKEN,
  KEYWORD
};
164 
// Token numbers. Must align with lbp_table[].
// The operator, keyword and builtin groups below must also list their
// members in exactly the same order as the ops, keywords and builtins
// string tables, because tokens are found by position in those strings.
enum tokens {
  // Bookkeeping and literal/identifier tokens.
  tkunusedtoken, tkeof, tkerr, tknl,
  tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin,

  // Operators and punctuation: one entry per 3-byte cell of the ops table.
  tksemi, tkcomma, tklbracket, tkrbracket,
  tklparen, tkrparen, tklbrace, tkrbrace,
  tkfield, tkincr, tkdecr, tkpow, tknot,
  tkmul, tkdiv, tkmod, tkplus, tkminus,
  tkcat, // !!! Fake operator for concatenation (just adjacent string exprs)
  tklt, tkle, tkne, tkeq, tkgt, tkge,
  tkmatchop, tknotmatch, tkand, tkor,
  tkternif, tkternelse,
  tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn, tkaddasgn, tksubasgn, tkasgn,
  tkappend, tkpipe,

  // Keywords: one entry per 10-byte cell of the keywords table.
  tkin, tkbegin, tkend, tkif, tkelse,
  tkwhile, tkfor, tkdo, tkbreak, tkcontinue, tkexit, tkfunction,
  tkreturn, tknext, tknextfile, tkdelete, tkprint, tkprintf, tkgetline,

  // Builtin functions: one entry per 10-byte cell of the builtins table.
  tkatan2, tkcos, tksin, tkexp, tklog, tksqrt, tkint, tkrand, tksrand,
  tklength, tktolower, tktoupper, tksystem, tkfflush,
  tkband, tkbor, tkbxor, tklshift, tkrshift,

  // Builtins listed separately as "specialfuncs"; they continue the
  // builtins table.
  tkclose, tkindex, tkmatch, tksplit,
  tksub, tkgsub, tksprintf, tksubstr,

  tklasttk
};
200 
201 enum opcodes {
202     opunusedop = tklasttk,
203     opvarref, opmapref, opfldref, oppush, opdrop, opdrop_n, opnotnot,
204     oppreincr, oppredecr, oppostincr, oppostdecr, opnegate, opjump, opjumptrue,
205     opjumpfalse, opprepcall, opmap, opmapiternext, opmapdelete, opmatchrec,
206     opquit, opprintrec, oprange1, oprange2, oprange3, oplastop
207 };
208 
// Special variables (POSIX), numbered from 1.
// Must align with char *spec_vars[].
enum spec_var_names {
  ARGC = 1,
  ARGV,
  CONVFMT,
  ENVIRON,
  FILENAME,
  FNR,
  FS,
  NF,
  NR,
  OFMT,
  OFS,
  ORS,
  RLENGTH,
  RS,
  RSTART,
  SUBSEP
};
212 
// Symbol table entry. The GLOBAL and LOCAL accessor macros both view their
// tables as arrays of this struct.
struct symtab_slot {    // global symbol table entry
  unsigned flags;       // ZF_* bits (see "Flag bits for zvalue and symbol tables")
  int slotnum;
  char *name;
};
218 
// zstring: flexible string type.
// Capacity must be > size because we insert a NUL byte.
struct zstring {
  int refcnt;         // extra references; zstring_release() frees at 0
  unsigned size;      // bytes of string data, not counting the NUL
  unsigned capacity;  // bytes available in str[]
  char str[];   // C99 flexible array member
};
227 
// Flag bits for zvalue and symbol tables
// Bits 0 and 8 are not assigned in this list.
#define ZF_MAYBEMAP (1u << 1)
#define ZF_MAP      (1u << 2)
#define ZF_SCALAR   (1u << 3)
#define ZF_NUM      (1u << 4)
#define ZF_RX       (1u << 5)
#define ZF_STR      (1u << 6)
#define ZF_NUMSTR   (1u << 7)   // "numeric string" per posix
#define ZF_REF      (1u << 9)   // for lvalues
#define ZF_MAPREF   (1u << 10)  // for lvalues
#define ZF_FIELDREF (1u << 11)  // for lvalues
#define ZF_EMPTY_RX (1u << 12)
// Either of the two map-related bits.
#define ZF_ANYMAP   (ZF_MAP | ZF_MAYBEMAP)
241 
// Macro to help facilitate possible future change in zvalue layout.
// Brace initializer for a struct zvalue: flags, num, then the pointer
// stored in the first union member.
#define ZVINIT(flags, num, ptr) {(flags), (double)(num), {(ptr)}}

// Flag tests on a struct zvalue pointer. Each yields the masked flag bit
// (nonzero if set), not a normalized 0/1.
#define IS_STR(zvalp) ((zvalp)->flags & ZF_STR)
#define IS_RX(zvalp) ((zvalp)->flags & ZF_RX)
#define IS_NUM(zvalp) ((zvalp)->flags & ZF_NUM)
#define IS_MAP(zvalp) ((zvalp)->flags & ZF_MAP)
#define IS_EMPTY_RX(zvalp) ((zvalp)->flags & ZF_EMPTY_RX)
250 
// Typed views of the zlist storage held in TT. Each macro re-reads the
// list's base pointer on every use, so it stays valid after the list has
// been reallocated.
#define GLOBAL      ((struct symtab_slot *)TT.globals_table.base)
#define LOCAL       ((struct symtab_slot *)TT.locals_table.base)
#define FUNC_DEF    ((struct functab_slot *)TT.func_def_table.base)

#define LITERAL     ((struct zvalue *)TT.literals.base)
#define STACK       ((struct zvalue *)TT.stack.base)
#define FIELD       ((struct zvalue *)TT.fields.base)

#define ZCODE       ((int *)TT.zcode.base)

// Flag bits; presumably for functab_slot.flags -- confirm against users.
#define FUNC_DEFINED    (1u)
#define FUNC_CALLED     (2u)

#define MIN_STACK_LEFT 1024
265 
// Entry of TT.func_def_table (see the FUNC_DEF accessor macro).
struct functab_slot {    // function symbol table entry
  unsigned flags;   // presumably FUNC_DEFINED / FUNC_CALLED bits -- confirm
  int slotnum;
  char *name;
  struct zlist function_locals;
  int zcode_addr;
};
273 
// Elements of the hash table (key/value pairs)
struct zmap_slot {
  int hash;       // store hash key to speed hash table expansion
  struct zstring *key;  // set to 0 by zmap_delete() when the entry is deleted
  struct zvalue val;
};
// Brace initializer for a zmap_slot, in member order.
#define ZMSLOTINIT(hash, key, val) {hash, key, val}
281 
// zmap: Mapping data type for arrays; a hash table. Values in hash are either
// 0 (unused), -1 (marked deleted), or one plus the number of the zmap slot
// containing a key/value pair. The zlist slot entries are numbered from 0 to
// count-1, so need to add one to distinguish from unused.  The probe sequence
// is borrowed from Python dict, using the "perturb" idea to mix in upper bits
// of the original hash value.
struct zmap {
  unsigned mask;  // tablesize - 1; tablesize is 2 ** n
  int *hash;      // (mask + 1) elements
  int limit;      // 80% of table size ((mask+1)*8/10)
  int count;      // number of occupied slots in hash
  int deleted;    // number of deleted slots
  struct zlist slot;     // expanding list of zmap_slot elements
};
296 
// Slot array of the map named m; a variable m (struct zmap *) must be in
// scope wherever this is used.
#define MAPSLOT    ((struct zmap_slot *)(m->slot).base)
// Error reporting wrappers around zzerr(). A leading '$' in the format
// makes zzerr() treat the message as fatal.
// FFATAL: fatal, printf-style; needs at least one argument after format.
#define FFATAL(format, ...) zzerr("$" format, __VA_ARGS__)
// FATAL: fatal, takes a single message string.
#define FATAL(...) zzerr("$%s\n", __VA_ARGS__)
// XERR: non-fatal, printf-style; needs at least one argument after format.
#define XERR(format, ...) zzerr(format, __VA_ARGS__)

#define NO_EXIT_STATUS  (9999987)  // value unlikely to appear in exit stmt
303 
// Local prototypes for the POSIX line readers used by this file.
ssize_t getline(char **lineptr, size_t *n, FILE *stream);
ssize_t getdelim(char ** restrict lineptr, size_t * restrict n, int delimiter, FILE *stream);
306 
307 
308 
309 ////////////////////
310 //// lib
311 ////////////////////
312 
// Release memory at p; a plain wrapper around free().
static void xfree(void *p)
{
  free(p);
}
317 
// Return the numeric value (0..15) of hex digit c.
// The caller must pass a valid hex digit; nothing is checked here.
static int hexval(int c)
{
  if (isdigit(c)) return c - '0';
  // Setting bit 0x20 folds 'A'..'F' onto 'a'..'f'.
  return (c | 0x20) - 'a' + 10;
}
323 
324 ////////////////////
325 //// common defs
326 ////////////////////
327 
// These (ops, keywords, builtins) must align with enum tokens
// Each table starts with one space and is then made of fixed-width,
// space-padded cells: 3 bytes per cell in ops, 10 bytes per cell in
// keywords and builtins. Lookup searches for " name " and turns the match
// offset into a token number, so the cell order is the token order.
// The ".." cell in ops fills the position of the fake concatenation
// operator (tkcat).
static char *ops = " ;  ,  [  ]  (  )  {  }  $  ++ -- ^  !  *  /  %  +  -  .. "
        "<  <= != == >  >= ~  !~ && || ?  :  ^= %= *= /= += -= =  >> |  ";

static char *keywords = " in        BEGIN     END       if        else      "
    "while     for       do        break     continue  exit      function  "
    "return    next      nextfile  delete    print     printf    getline   ";

static char *builtins = " atan2     cos       sin       exp       log       "
    "sqrt      int       rand      srand     length    "
    "tolower   toupper   system    fflush    "
    "and       or        xor       lshift    rshift    "
    "close     index     match     split     "
    "sub       gsub      sprintf   substr    ";
342 
zzerr(char * format,...)343 static void zzerr(char *format, ...)
344 {
345   va_list args;
346   int fatal_sw = 0;
347   fprintf(stderr, "%s: ", TT.progname);
348   if (format[0] == '$') {
349     fprintf(stderr, "FATAL: ");
350     format++;
351     fatal_sw = 1;
352   }
353   fprintf(stderr, "file %s line %d: ", TT.scs->filename, TT.scs->line_num);
354   va_start(args, format);
355   vfprintf(stderr, format, args);
356   va_end(args);
357   if (format[strlen(format)-1] != '\n') fputc('\n', stderr); // TEMP FIXME !!!
358   fflush(stderr);
359   if (fatal_sw) exit(2);
360         // Don't bump error count for warnings
361   else if (!strstr(format, "arning")) TT.cgl.compile_error_count++;
362 }
363 
get_token_text(char * op,int tk)364 static void get_token_text(char *op, int tk)
365 {
366   // This MUST ? be changed if ops string or tk... assignments change!
367   memmove(op, ops + 3 * (tk - tksemi) + 1, 2);
368   op[ op[1] == ' ' ? 1 : 2 ] = 0;
369 }
370 
371 ////////////////////
372 /// UTF-8
373 ////////////////////
374 
375 // Return number of bytes in 'cnt' utf8 codepoints
bytesinutf8(char * str,size_t len,size_t cnt)376 static int bytesinutf8(char *str, size_t len, size_t cnt)
377 {
378   if (FLAG(b)) return cnt;
379   unsigned wch;
380   char *lim = str + len, *s0 = str;
381   while (cnt-- && str < lim) {
382     int r = utf8towc(&wch, str, lim - str);
383     str += r > 0 ? r : 1;
384   }
385   return str - s0;
386 }
387 
388 // Return number of utf8 codepoints in str
utf8cnt(char * str,size_t len)389 static int utf8cnt(char *str, size_t len)
390 {
391   unsigned wch;
392   int cnt = 0;
393   char *lim;
394   if (!len || FLAG(b)) return len;
395   for (lim = str + len; str < lim; cnt++) {
396     int r = utf8towc(&wch, str, lim - str);
397     str += r > 0 ? r : 1;
398   }
399   return cnt;
400 }
401 
402 ////////////////////
403 ////   zlist
404 ////////////////////
405 
zlist_initx(struct zlist * p,size_t size,size_t count)406 static struct zlist *zlist_initx(struct zlist *p, size_t size, size_t count)
407 {
408   p->base = p->avail = xzalloc(count * size);
409   p->limit = p->base + size * count;
410   p->size = size;
411   return p;
412 }
413 
// Set up list p for elements of size bytes, with as many initial elements
// as fit in SLIST_MAX_INIT_BYTES. Returns p.
static struct zlist *zlist_init(struct zlist *p, size_t size)
{
#define SLIST_MAX_INIT_BYTES 128
  size_t initial_count = SLIST_MAX_INIT_BYTES / size;

  return zlist_initx(p, size, initial_count);
}
419 
420 // This is called from zlist_append() and add_stack() in run
zlist_expand(struct zlist * p)421 static void zlist_expand(struct zlist *p)
422 {
423   size_t offset = p->avail - p->base;
424   size_t cap = p->limit - p->base;
425   size_t newcap = maxof(cap + p->size, ((cap / p->size) * 3 / 2) * p->size);
426   if (newcap <= cap) error_exit("mem req error");
427   char *base = xrealloc(p->base, newcap);
428   p->base = base;
429   p->limit = base + newcap;
430   p->avail = base + offset;
431 }
432 
zlist_append(struct zlist * p,void * obj)433 static size_t zlist_append(struct zlist *p, void *obj)
434 {
435   // Insert obj (p->size bytes) at end of list, expand as needed.
436   // Return scaled offset to newly inserted obj; i.e. the
437   // "slot number" 0, 1, 2,...
438   void *objtemp = 0;
439   if (p->avail > p->limit - p->size) {
440     objtemp = xmalloc(p->size);     // Copy obj in case it is in
441     memmove(objtemp, obj, p->size); // the area realloc might free!
442     obj = objtemp;
443     zlist_expand(p);
444   }
445   memmove(p->avail, obj, p->size);
446   if (objtemp) xfree(objtemp);
447   p->avail += p->size;
448   return (p->avail - p->base - p->size) / p->size;  // offset of updated slot
449 }
450 
zlist_len(struct zlist * p)451 static int zlist_len(struct zlist *p)
452 {
453   return (p->avail - p->base) / p->size;
454 }
455 
456 ////////////////////
457 ////   zstring
458 ////////////////////
459 
zstring_release(struct zstring ** s)460 static void zstring_release(struct zstring **s)
461 {
462   if (*s && (**s).refcnt-- == 0) xfree(*s); //free_zstring(s);
463   *s = 0;
464 }
465 
zstring_incr_refcnt(struct zstring * s)466 static void zstring_incr_refcnt(struct zstring *s)
467 {
468   if (s) s->refcnt++;
469 }
470 
471 // !! Use only if 'to' is NULL or its refcnt is 0.
zstring_modify(struct zstring * to,size_t at,char * s,size_t n)472 static struct zstring *zstring_modify(struct zstring *to, size_t at, char *s, size_t n)
473 {
474   size_t cap = at + n + 1;
475   if (!to || to->capacity < cap) {
476     to = xrealloc(to, sizeof(*to) + cap);
477     to->capacity = cap;
478     to->refcnt = 0;
479   }
480   memcpy(to->str + at, s, n);
481   to->size = at + n;
482   to->str[to->size] = '\0';
483   return to;
484 }
485 
486 // The 'to' pointer may move by realloc, so return (maybe updated) pointer.
487 // If refcnt is nonzero then there is another pointer to this zstring,
488 // so copy this one and release it. If refcnt is zero we can mutate this.
zstring_update(struct zstring * to,size_t at,char * s,size_t n)489 static struct zstring *zstring_update(struct zstring *to, size_t at, char *s, size_t n)
490 {
491   if (to && to->refcnt) {
492     struct zstring *to_before = to;
493     to = zstring_modify(0, 0, to->str, to->size);
494     zstring_release(&to_before);
495   }
496   return zstring_modify(to, at, s, n);
497 }
498 
// Make 'to' hold a copy of the contents of 'from', via zstring_update()
// at offset 0. Returns the resulting string, which may not be 'to' itself.
static struct zstring *zstring_copy(struct zstring *to, struct zstring *from)
{
  return zstring_update(to, 0, from->str, from->size);
}
503 
// Append the contents of 'from' to 'to', via zstring_update() at offset
// to->size. 'to' must not be null (it is dereferenced here).
// Returns the resulting string, which may not be 'to' itself.
static struct zstring *zstring_extend(struct zstring *to, struct zstring *from)
{
  return zstring_update(to, to->size, from->str, from->size);
}
508 
// Allocate a new zstring holding a copy of the size bytes at s.
// The new string has refcnt 0 (set by zstring_modify()).
static struct zstring *new_zstring(char *s, size_t size)
{
  return zstring_modify(0, 0, s, size);
}
513 
514 ////////////////////
515 ////   zvalue
516 ////////////////////
517 
// A zvalue with no flags set, num 0 and a null pointer.
static struct zvalue uninit_zvalue = ZVINIT(0, 0.0, 0);

// This will be reassigned in init_globals() with an empty string.
// It's a special value used for "uninitialized" field vars
// referenced past $NF. See push_field().
static struct zvalue uninit_string_zvalue = ZVINIT(0, 0.0, 0);
524 
new_str_val(char * s)525 static struct zvalue new_str_val(char *s)
526 {
527   // Only if no nul inside string!
528   struct zvalue v = ZVINIT(ZF_STR, 0.0, new_zstring(s, strlen(s)));
529   return v;
530 }
531 
zvalue_release_zstring(struct zvalue * v)532 static void zvalue_release_zstring(struct zvalue *v)
533 {
534   if (v && ! (v->flags & (ZF_ANYMAP | ZF_RX))) zstring_release(&v->vst);
535 }
536 
537 // push_val() is used for initializing globals (see init_compiler())
538 // but mostly used in runtime
539 // WARNING: push_val may change location of v, so do NOT depend on it after!
540 // Note the incr refcnt used to be after the zlist_append, but that caused a
541 // heap-use-after-free error when the zlist_append relocated the zvalue being
542 // pushed, invalidating the v pointer.
push_val(struct zvalue * v)543 static void push_val(struct zvalue *v)
544 {
545   if (IS_STR(v) && v->vst) v->vst->refcnt++;  // inlined zstring_incr_refcnt()
546   *++TT.stackp = *v;
547 }
548 
zvalue_copy(struct zvalue * to,struct zvalue * from)549 static void zvalue_copy(struct zvalue *to, struct zvalue *from)
550 {
551   if (IS_RX(from)) *to = *from;
552   else {
553     zvalue_release_zstring(to);
554     *to = *from;
555     zstring_incr_refcnt(to->vst);
556   }
557 }
558 
zvalue_dup_zstring(struct zvalue * v)559 static void zvalue_dup_zstring(struct zvalue *v)
560 {
561   struct zstring *z = new_zstring(v->vst->str, v->vst->size);
562   zstring_release(&v->vst);
563   v->vst = z;
564 }
565 
566 ////////////////////
567 ////   zmap (array) implementation
568 ////////////////////
569 
zstring_match(struct zstring * a,struct zstring * b)570 static int zstring_match(struct zstring *a, struct zstring *b)
571 {
572   return a->size == b->size && memcmp(a->str, b->str, a->size) == 0;
573 }
574 
zstring_hash(struct zstring * s)575 static int zstring_hash(struct zstring *s)
576 {   // djb2 -- small, fast, good enough for this
577   unsigned h = 5381;
578   char *p = s->str, *lim = p + s->size;
579   while (p < lim)
580     h = (h << 5) + h + *p++;
581   return h;
582 }
583 
// Bits shifted out of the running "perturb" value at each probe step.
enum { PSHIFT = 5 };  // "perturb" shift -- see find_mapslot() below
585 
find_mapslot(struct zmap * m,struct zstring * key,int * hash,int * probe)586 static struct zmap_slot *find_mapslot(struct zmap *m, struct zstring *key, int *hash, int *probe)
587 {
588   struct zmap_slot *x = 0;
589   unsigned perturb = *hash = zstring_hash(key);
590   *probe = *hash & m->mask;
591   int n, first_deleted = -1;
592   while ((n = m->hash[*probe])) {
593     if (n > 0) {
594       x = &MAPSLOT[n-1];
595       if (*hash == x->hash && zstring_match(key, x->key)) {
596         return x;
597       }
598     } else if (first_deleted < 0) first_deleted = *probe;
599     // Based on technique in Python dict implementation. Comment there
600     // (https://github.com/python/cpython/blob/3.10/Objects/dictobject.c)
601     // says
602     //
603     // j = ((5*j) + 1) mod 2**i
604     // For any initial j in range(2**i), repeating that 2**i times generates
605     // each int in range(2**i) exactly once (see any text on random-number
606     // generation for proof).
607     //
608     // The addition of 'perturb' greatly improves the probe sequence. See
609     // the Python dict implementation for more details.
610     *probe = (*probe * 5 + 1 + (perturb >>= PSHIFT)) & m->mask;
611   }
612   if (first_deleted >= 0) *probe = first_deleted;
613   return 0;
614 }
615 
zmap_find(struct zmap * m,struct zstring * key)616 static struct zvalue *zmap_find(struct zmap *m, struct zstring *key)
617 {
618   int hash, probe;
619   struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
620   return x ? &x->val : 0;
621 }
622 
zmap_init(struct zmap * m)623 static void zmap_init(struct zmap *m)
624 {
625   enum {INIT_SIZE = 8};
626   m->mask = INIT_SIZE - 1;
627   m->hash = xzalloc(INIT_SIZE * sizeof(*m->hash));
628   m->limit = INIT_SIZE * 8 / 10;
629   m->count = 0;
630   m->deleted = 0;
631   zlist_init(&m->slot, sizeof(struct zmap_slot));
632 }
633 
zvalue_map_init(struct zvalue * v)634 static void zvalue_map_init(struct zvalue *v)
635 {
636   struct zmap *m = xmalloc(sizeof(*m));
637   zmap_init(m);
638   v->map = m;
639   v->flags |= ZF_MAP;
640 }
641 
zmap_delete_map_incl_slotdata(struct zmap * m)642 static void zmap_delete_map_incl_slotdata(struct zmap *m)
643 {
644   for (struct zmap_slot *p = &MAPSLOT[0]; p < &MAPSLOT[zlist_len(&m->slot)]; p++) {
645     if (p->key) zstring_release(&p->key);
646     if (p->val.vst) zstring_release(&p->val.vst);
647   }
648   xfree(m->slot.base);
649   xfree(m->hash);
650 }
651 
// Empty map m: release all of its contents, then re-initialize it as a
// new empty map.
static void zmap_delete_map(struct zmap *m)
{
  zmap_delete_map_incl_slotdata(m);
  zmap_init(m);
}
657 
zmap_rehash(struct zmap * m)658 static void zmap_rehash(struct zmap *m)
659 {
660   // New table is twice the size of old.
661   int size = m->mask + 1;
662   unsigned mask = 2 * size - 1;
663   int *h = xzalloc(2 * size * sizeof(*m->hash));
664   // Step through the old hash table, set up location in new table.
665   for (int i = 0; i < size; i++) {
666     int n = m->hash[i];
667     if (n > 0) {
668       int hash = MAPSLOT[n-1].hash;
669       unsigned perturb = hash;
670       int p = hash & mask;
671       while (h[p]) {
672         p = (p * 5 + 1 + (perturb >>= PSHIFT)) & mask;
673       }
674       h[p] = n;
675     }
676   }
677   m->mask = mask;
678   xfree(m->hash);
679   m->hash = h;
680   m->limit = 2 * size * 8 / 10;
681 }
682 
zmap_find_or_insert_key(struct zmap * m,struct zstring * key)683 static struct zmap_slot *zmap_find_or_insert_key(struct zmap *m, struct zstring *key)
684 {
685   int hash, probe;
686   struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
687   if (x) return x;
688   // not found; insert it.
689   if (m->count == m->limit) {
690     zmap_rehash(m);         // rehash if getting too full.
691     // rerun find_mapslot to get new probe index
692     x = find_mapslot(m, key, &hash, &probe);
693   }
694   // Assign key to new slot entry and bump refcnt.
695   struct zmap_slot zs = ZMSLOTINIT(hash, key, (struct zvalue)ZVINIT(0, 0.0, 0));
696   zstring_incr_refcnt(key);
697   int n = zlist_append(&m->slot, &zs);
698   m->count++;
699   m->hash[probe] = n + 1;
700   return &MAPSLOT[n];
701 }
702 
zmap_delete(struct zmap * m,struct zstring * key)703 static void zmap_delete(struct zmap *m, struct zstring *key)
704 {
705   int hash, probe;
706   struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
707   if (!x) return;
708   zstring_release(&MAPSLOT[m->hash[probe] - 1].key);
709   m->hash[probe] = -1;
710   m->deleted++;
711 }
712 
713 ////////////////////
714 //// scan (lexical analyzer)
715 ////////////////////
716 
// TODO:
// Is line_num being incremented correctly? Does a newline count as the
// start of a line?
// Handle NUL bytes in the program file better.
// Open files "rb" and handle CRs in the program.
// Roll gch() into get_char()?
// Deal with signed char (at EOF? elsewhere?)
//
// 2023-01-11: Allow NUL bytes inside strings? Inside regexes?
725 
progfile_open(void)726 static void progfile_open(void)
727 {
728   TT.scs->filename = TT.scs->prog_args->arg;
729   TT.scs->prog_args = TT.scs->prog_args->next;
730   TT.scs->fp = stdin;
731   if (strcmp(TT.scs->filename, "-")) TT.scs->fp = fopen(TT.scs->filename, "r");
732   if (!TT.scs->fp) error_exit("Can't open %s", TT.scs->filename);
733   TT.scs->line_num = 0;
734 }
735 
get_char(void)736 static int get_char(void)
737 {
738   static char *nl = "\n";
739   // On first entry, TT.scs->p points to progstring if any, or null string.
740   for (;;) {
741     int c = *(TT.scs->p)++;
742     if (c) {
743       return c;
744     }
745     if (TT.scs->progstring) {  // Fake newline at end of progstring.
746       if (TT.scs->progstring == nl) return EOF;
747       TT.scs->p = TT.scs->progstring = nl;
748       continue;
749     }
750     // Here if getting from progfile(s).
751     if (TT.scs->line == nl) return EOF;
752     if (!TT.scs->fp) {
753       progfile_open();
754     // The "  " + 1 is to set p to null string but allow ref to prev char for
755     // "lastchar" test below.
756     }
757     // Save last char to allow faking final newline.
758     int lastchar = (TT.scs->p)[-2];
759     TT.scs->line_len = getline(&TT.scs->line, &TT.scs->line_size, TT.scs->fp);
760     if (TT.scs->line_len > 0) {
761       TT.scs->line_num++;
762       TT.scs->p = TT.scs->line;
763       continue;
764     }
765     // EOF
766     // FIXME TODO or check for error? feof() vs. ferror()
767     fclose(TT.scs->fp);
768     TT.scs->fp = 0;
769     TT.scs->p = "  " + 2;
770     if (!TT.scs->prog_args) {
771       xfree(TT.scs->line);
772       if (lastchar == '\n') return EOF;
773       // Fake final newline
774       TT.scs->line = TT.scs->p = nl;
775     }
776   }
777 }
778 
append_this_char(int c)779 static void append_this_char(int c)
780 {
781   if (TT.scs->toklen == TT.scs->maxtok - 1) {
782     TT.scs->maxtok *= 2;
783     TT.scs->tokstr = xrealloc(TT.scs->tokstr, TT.scs->maxtok);
784   }
785   TT.scs->tokstr[TT.scs->toklen++] = c;
786   TT.scs->tokstr[TT.scs->toklen] = 0;
787 }
788 
gch(void)789 static void gch(void)
790 {
791   // FIXME probably not right place to skip CRs.
792   do {
793     TT.scs->ch = get_char();
794   } while (TT.scs->ch == '\r');
795 }
796 
// Append the current character (TT.scs->ch) to the token text, then fetch
// the next character.
static void append_char(void)
{
  append_this_char(TT.scs->ch);
  gch();
}
802 
find_keyword_or_builtin(char * table,int first_tok_in_table)803 static int find_keyword_or_builtin(char *table,
804     int first_tok_in_table)
805 {
806   char s[16] = " ", *p;
807   // keywords and builtin functions are spaced 10 apart for strstr() lookup,
808   // so must be less than that long.
809   if (TT.scs->toklen >= 10) return 0;
810   strcat(s, TT.scs->tokstr);
811   strcat(s, " ");
812   p = strstr(table, s);
813   if (!p) return 0;
814   return first_tok_in_table + (p - table) / 10;
815 }
816 
find_token(void)817 static int find_token(void)
818 {
819   char s[6] = " ", *p;
820   // tokens are spaced 3 apart for strstr() lookup, so must be less than
821   // that long.
822   strcat(s, TT.scs->tokstr);
823   strcat(s, " ");
824   p = strstr(ops, s);
825   if (!p) return 0;
826   return tksemi + (p - ops) / 3;
827 }
828 
// Return the keyword token for the current token text (tkin is the first
// entry of the keywords table), or 0 if it is not a keyword.
static int find_keyword(void)
{
  return find_keyword_or_builtin(keywords, tkin);
}
833 
// Return the builtin-function token for the current token text (tkatan2 is
// the first entry of the builtins table), or 0 if it is not a builtin.
static int find_builtin(void)
{
  return find_keyword_or_builtin(builtins, tkatan2);
}
838 
get_number(void)839 static void get_number(void)
840 {
841   // Assumes TT.scs->ch is digit or dot on entry.
842   // TT.scs->p points to the following character.
843   // OK formats: 1 1. 1.2 1.2E3 1.2E+3 1.2E-3 1.E2 1.E+2 1.E-2 1E2 .1 .1E2
844   // .1E+2 .1E-2
845   // NOT OK: . .E .E1 .E+ .E+1 ; 1E .1E 1.E 1.E+ 1.E- parse as number
846   // followed by variable E.
847   // gawk accepts 12.E+ and 12.E- as 12; nawk & mawk say syntax error.
848   char *leftover;
849   int len;
850   TT.scs->numval = strtod(TT.scs->p - 1, &leftover);
851   len = leftover - TT.scs->p + 1;
852   if (len == 0) {
853     append_char();
854     TT.scs->toktype = ERROR;
855     TT.scs->tok = tkerr;
856     TT.scs->error = 1;
857     FFATAL("Unexpected token '%s'\n", TT.scs->tokstr);
858     return;
859   }
860   while (len--)
861     append_char();
862 }
863 
// Scan the body of a string or regex literal into the token text.
// endchar is the closing delimiter; the comparisons below treat '/' as the
// regex case. On entry TT.scs->ch is the opening delimiter; on return it is
// the character after the closing delimiter (or after the newline of an
// unterminated literal). Escape sequences are translated as they are read.
static void get_string_or_regex(int endchar)
{
  gch();    // step past the opening delimiter
  while (TT.scs->ch != endchar) {
    if (TT.scs->ch == '\n') {
      // FIXME Handle unterminated string or regex. Is this OK?
      // FIXME TODO better diagnostic here?
      XERR("%s\n", "unterminated string or regex");
      break;
    } else if (TT.scs->ch == '\\') {
      // \\ \a \b \f \n \r \t \v \" \/ \ddd
      char *p, *escapes = "\\abfnrtv\"/";
      gch();
      if (TT.scs->ch == '\n') {  // backslash newline is continuation
        gch();
        continue;
      } else if ((p = strchr(escapes, TT.scs->ch))) {
        // posix regex does not use these escapes,
        // but awk does, so do them.
        // The literal below lists the replacement characters in the same
        // order as 'escapes'.
        int c = "\\\a\b\f\n\r\t\v\"/"[p-escapes];
        append_this_char(c);
        // Need to double up \ inside literal regex
        if (endchar == '/' && c == '\\') append_this_char('\\');
        gch();
      } else if (TT.scs->ch == 'x') {
        // \x followed by one or two hex digits; with no hex digit it is
        // taken as a plain 'x'.
        gch();
        if (isxdigit(TT.scs->ch)) {
          int c = hexval(TT.scs->ch);
          gch();
          if (isxdigit(TT.scs->ch)) {
            c = c * 16 + hexval(TT.scs->ch);
            gch();
          }
          append_this_char(c);
        } else append_this_char('x');
      } else if (TT.scs->ch == 'u') {
        // \u followed by one to eight hex digits: the value is encoded
        // with wctoutf8() and the resulting bytes are appended. With no
        // hex digit it is taken as a plain 'u'.
        gch();
        if (isxdigit(TT.scs->ch)) {
          int i = 0, j = 0, c = 0;
          char codep[9] = {0};
          do {
            codep[j++] = TT.scs->ch;
            gch();
          } while (j < 8 && isxdigit(TT.scs->ch));
          c = strtol(codep, 0, 16);
          // codep is reused as the output buffer for the encoded bytes.
          for (i = wctoutf8(codep, c), j = 0; j < i; j++)
            append_this_char(codep[j]);
        } else append_this_char('u');
      } else if (isdigit(TT.scs->ch)) {
        if (TT.scs->ch < '8') {
          // \ddd: up to three octal digits.
          int k, c = 0;
          for (k = 0; k < 3; k++) {
            if (isdigit(TT.scs->ch) && TT.scs->ch < '8') {
              c = c * 8 + TT.scs->ch - '0';
              gch();
            } else
              break;
          }
          append_this_char(c);
        } else {
          // \8 or \9: the digit is kept, the backslash is dropped.
          append_char();
        }
      } else {
        // Not an awk escape. The character after the backslash is not
        // consumed here; it stays in TT.scs->ch for the next iteration.
        if (endchar == '/') {
          // pass \ unmolested if not awk escape,
          // so that regex routines can see it.
          if (!strchr(".[]()*+?{}|^$-", TT.scs->ch)) {
            XERR("warning: '\\%c' -- unknown regex escape\n", TT.scs->ch);
          }
          append_this_char('\\');
        } else {
          XERR("warning: '\\%c' treated as plain '%c'\n", TT.scs->ch, TT.scs->ch);
        }
      }
    } else if (TT.scs->ch == EOF) {
      FATAL("EOF in string or regex\n");
    } else {
      append_char();
    }
  }
  gch();    // step past the closing delimiter (or the newline after a break)
}
946 
ascan_opt_div(int div_op_allowed_here)947 static void ascan_opt_div(int div_op_allowed_here)
948 {
949   int n;
950   for (;;) {
951     TT.scs->tokbuiltin = 0;
952     TT.scs->toklen = 0;
953     TT.scs->tokstr[0] = 0;
954     while (TT.scs->ch == ' ' || TT.scs->ch == '\t')
955       gch();
956     if (TT.scs->ch == '\\') {
957       append_char();
958       if (TT.scs->ch == '\n') {
959         gch();
960         continue;
961       }
962       TT.scs->toktype = ERROR;   // \ not last char in line.
963       TT.scs->tok = tkerr;
964       TT.scs->error = 3;
965       FATAL("backslash not last char in line\n");
966       return;
967     }
968     break;
969   }
970   // Note \<NEWLINE> in comment does not continue it.
971   if (TT.scs->ch == '#') {
972     gch();
973     while (TT.scs->ch != '\n')
974       gch();
975     // Need to fall through here to pick up newline.
976   }
977   if (TT.scs->ch == '\n') {
978     TT.scs->toktype = NEWLINE;
979     TT.scs->tok = tknl;
980     append_char();
981   } else if (isalpha(TT.scs->ch) || TT.scs->ch == '_') {
982     append_char();
983     while (isalnum(TT.scs->ch) || TT.scs->ch == '_') {
984       append_char();
985     }
986     if ((n = find_keyword()) != 0) {
987       TT.scs->toktype = KEYWORD;
988       TT.scs->tok = n;
989     } else if ((n = find_builtin()) != 0) {
990       TT.scs->toktype = BUILTIN;
991       TT.scs->tok = tkbuiltin;
992       TT.scs->tokbuiltin = n;
993     } else if ((TT.scs->ch == '(')) {
994       TT.scs->toktype = USERFUNC;
995       TT.scs->tok = tkfunc;
996     } else {
997       TT.scs->toktype = VAR;
998       TT.scs->tok = tkvar;
999       // skip whitespace to be able to check for , or )
1000       while (TT.scs->ch == ' ' || TT.scs->ch == '\t')
1001         gch();
1002     }
1003     return;
1004   } else if (TT.scs->ch == '"') {
1005     TT.scs->toktype = STRING;
1006     TT.scs->tok = tkstring;
1007     get_string_or_regex('"');
1008   } else if (isdigit(TT.scs->ch) || TT.scs->ch == '.') {
1009     TT.scs->toktype = NUMBER;
1010     TT.scs->tok = tknumber;
1011     get_number();
1012   } else if (TT.scs->ch == '/' && ! div_op_allowed_here) {
1013     TT.scs->toktype = REGEX;
1014     TT.scs->tok = tkregex;
1015     get_string_or_regex('/');
1016   } else if (TT.scs->ch == EOF) {
1017     TT.scs->toktype = EOF;
1018     TT.scs->tok = tkeof;
1019   } else if (TT.scs->ch == '\0') {
1020     append_char();
1021     TT.scs->toktype = ERROR;
1022     TT.scs->tok = tkerr;
1023     TT.scs->error = 5;
1024     FATAL("null char\n");
1025   } else {
1026     // All other tokens.
1027     TT.scs->toktype = TT.scs->ch;
1028     append_char();
1029     // Special case for **= and ** tokens
1030     if (TT.scs->toktype == '*' && TT.scs->ch == '*') {
1031       append_char();
1032       if (TT.scs->ch == '=') {
1033         append_char();
1034         TT.scs->tok = tkpowasgn;
1035       } else TT.scs->tok = tkpow;
1036       TT.scs->toktype = TT.scs->tok + 200;
1037       return;
1038     }
1039     // Is it a 2-character token?
1040     if (TT.scs->ch != ' ' && TT.scs->ch != '\n') {
1041       append_this_char(TT.scs->ch);
1042       if (find_token()) {
1043         TT.scs->tok = find_token();
1044         TT.scs->toktype = TT.scs->tok + 200;
1045         gch();  // Eat second char of token.
1046         return;
1047       }
1048       TT.scs->toklen--;  // Not 2-character token; back off.
1049       TT.scs->tokstr[TT.scs->toklen] = 0;
1050     }
1051     TT.scs->tok = find_token();
1052     if (TT.scs->tok) return;
1053     TT.scs->toktype = ERROR;
1054     TT.scs->tok = tkerr;
1055     TT.scs->error = 4;
1056     FFATAL("Unexpected token '%s'\n", TT.scs->tokstr);
1057   }
1058 }
1059 
scan_opt_div(int div_op_allowed_here)1060 static void scan_opt_div(int div_op_allowed_here)
1061 {
1062   // TODO FIXME need better diags for bad tokens!
1063   // TODO Also set global syntax error flag.
1064   do ascan_opt_div(div_op_allowed_here); while (TT.scs->tok == tkerr);
1065 }
1066 
init_scanner(void)1067 static void init_scanner(void)
1068 {
1069   TT.prevtok = tkeof;
1070   gch();
1071 }
1072 
1073 // POSIX says '/' does not begin a regex wherever '/' or '/=' can mean divide.
1074 // Pretty sure if / or /= comes after these, it means divide:
1075 static char div_preceders[] = {tknumber, tkstring, tkvar, tkgetline, tkrparen, tkrbracket, tkincr, tkdecr, 0};
1076 
1077 // For checking end of prev statement for termination and if '/' can come next
1078 
scan(void)1079 static void scan(void)
1080 {
1081   TT.prevtok = TT.scs->tok;
1082   if (TT.prevtok && strchr(div_preceders, TT.prevtok)) scan_opt_div(1);
1083   else scan_opt_div(0);
1084   TT.tokstr = TT.scs->tokstr;
1085 }
1086 
1087 ////////////////////
1088 //// compile
1089 ////////////////////
1090 
1091 //  NOTES:
1092 //  NL ok after , { && || do else OR after right paren after if/while/for
1093 //  TODO:
1094 //    see case tkgetline -- test more
1095 //    case tkmatchop, tknotmatch -- fix ~ (/re/)
1096 
1097 // Forward declarations -- for mutually recursive parsing functions
1098 static int expr(int rbp);
1099 static void lvalue(void);
1100 static int primary(void);
1101 static void stmt(void);
1102 static void action(int action_type);
1103 
// Current token number, and a test of it against a given token number.
#define CURTOK() (TT.scs->tok)
#define ISTOK(toknum) (CURTOK() == (toknum))
1106 
// If the current token is tk, consume it and return 1; otherwise leave the
// scanner untouched and return 0.
static int havetok(int tk)
{
  if (ISTOK(tk)) {
    scan();
    return 1;
  }
  return 0;
}
1113 
1114 //// code and "literal" emitters
gen2cd(int op,int n)1115 static void gen2cd(int op, int n)
1116 {
1117   zlist_append(&TT.zcode, &op);
1118   TT.zcode_last = zlist_append(&TT.zcode, &n);
1119 }
1120 
gencd(int op)1121 static void gencd(int op)
1122 {
1123   TT.zcode_last = zlist_append(&TT.zcode, &op);
1124 }
1125 
make_literal_str_val(char * s)1126 static int make_literal_str_val(char *s)
1127 {
1128   // Only if no nul inside string!
1129   struct zvalue v = new_str_val(s);
1130   return zlist_append(&TT.literals, &v);
1131 }
1132 
make_literal_regex_val(char * s)1133 static int make_literal_regex_val(char *s)
1134 {
1135   regex_t *rx;
1136   rx = xmalloc(sizeof(*rx));
1137   xregcomp(rx, s, REG_EXTENDED);
1138   struct zvalue v = ZVINIT(ZF_RX, 0, 0);
1139   v.rx = rx;
1140   // Flag empty rx to make it easy to identify for split() special case
1141   if (!*s) v.flags |= ZF_EMPTY_RX;
1142   return zlist_append(&TT.literals, &v);
1143 }
1144 
make_literal_num_val(double num)1145 static int make_literal_num_val(double num)
1146 {
1147   struct zvalue v = ZVINIT(ZF_NUM, num, 0);
1148   return zlist_append(&TT.literals, &v);
1149 }
1150 
make_uninit_val(void)1151 static int make_uninit_val(void)
1152 {
1153   struct zvalue v = uninit_zvalue;
1154   return zlist_append(&TT.literals, &v);
1155 }
1156 //// END code and "literal" emitters
1157 
1158 //// Symbol tables functions
find_func_def_entry(char * s)1159 static int find_func_def_entry(char *s)
1160 {
1161   for (int k = 1; k < zlist_len(&TT.func_def_table); k++)
1162     if (!strcmp(s, FUNC_DEF[k].name)) return k;
1163   return 0;
1164 }
1165 
add_func_def_entry(char * s)1166 static int add_func_def_entry(char *s)
1167 {
1168   struct functab_slot ent = {0, 0, 0, {0, 0, 0, 0}, 0};
1169   ent.name = xstrdup(s);
1170   int slotnum = zlist_append(&TT.func_def_table, &ent);
1171   FUNC_DEF[slotnum].slotnum = slotnum;
1172   return slotnum;
1173 }
1174 
find_global(char * s)1175 static int find_global(char *s)
1176 {
1177   for (int k = 1; k < zlist_len(&TT.globals_table); k++)
1178     if (!strcmp(s, GLOBAL[k].name)) return k;
1179   return 0;
1180 }
1181 
add_global(char * s)1182 static int add_global(char *s)
1183 {
1184   struct symtab_slot ent = {0, 0, 0};
1185   ent.name = xstrdup(s);
1186   int slotnum = zlist_append(&TT.globals_table, &ent);
1187   GLOBAL[slotnum].slotnum = slotnum;
1188   return slotnum;
1189 }
1190 
find_local_entry(char * s)1191 static int find_local_entry(char *s)
1192 {
1193   for (int k = 1; k < zlist_len(&TT.locals_table); k++)
1194     if (!strcmp(s, LOCAL[k].name)) return k;
1195   return 0;
1196 }
1197 
add_local_entry(char * s)1198 static int add_local_entry(char *s)
1199 {
1200   struct symtab_slot ent = {0, 0, 0};
1201   ent.name = xstrdup(s);
1202   int slotnum = zlist_append(&TT.locals_table, &ent);
1203   LOCAL[slotnum].slotnum = slotnum;
1204   return slotnum;
1205 }
1206 
find_or_add_var_name(void)1207 static int find_or_add_var_name(void)
1208 {
1209   int slotnum = 0;    // + means global; - means local to function
1210   int globals_ent = 0;
1211   int locals_ent = find_local_entry(TT.tokstr);   // in local symbol table?
1212   if (locals_ent) {
1213     slotnum = -LOCAL[locals_ent].slotnum;
1214   } else {
1215     globals_ent = find_global(TT.tokstr);
1216     if (!globals_ent) globals_ent = add_global(TT.tokstr);
1217     slotnum = GLOBAL[globals_ent].slotnum;
1218     if (find_func_def_entry(TT.tokstr))
1219       // POSIX: The same name shall not be used both as a variable name
1220       // with global scope and as the name of a function.
1221       XERR("var '%s' used as function name\n", TT.tokstr);
1222   }
1223   return slotnum;
1224 }
1225 
1226 //// END Symbol tables functions
1227 
1228 //// Initialization
init_locals_table(void)1229 static void init_locals_table(void)
1230 {
1231   static struct symtab_slot locals_ent;
1232   zlist_init(&TT.locals_table, sizeof(struct symtab_slot));
1233   zlist_append(&TT.locals_table, &locals_ent);
1234 }
1235 
init_tables(void)1236 static void init_tables(void)
1237 {
1238   static struct symtab_slot global_ent;
1239   static struct functab_slot func_ent;
1240 
1241   // Append dummy elements in lists to force valid offsets nonzero.
1242   zlist_init(&TT.globals_table, sizeof(struct symtab_slot));
1243   zlist_append(&TT.globals_table, &global_ent);
1244   zlist_init(&TT.func_def_table, sizeof(struct functab_slot));
1245   zlist_append(&TT.func_def_table, &func_ent);
1246   init_locals_table();
1247   zlist_init(&TT.zcode, sizeof(int));
1248   gencd(tkeof);   // to ensure zcode offsets are non-zero
1249   zlist_init(&TT.literals, sizeof(struct zvalue));
1250   // Init stack size at twice MIN_STACK_LEFT. MIN_STACK_LEFT is at least as
1251   // many entries as any statement may ever take.  Currently there is no diag
1252   // if this is exceeded; prog. will probably crash. 1024 should be plenty?
1253   zlist_initx(&TT.stack, sizeof(struct zvalue), 2 * MIN_STACK_LEFT);
1254   TT.stackp = (struct zvalue *)TT.stack.base;
1255   zlist_init(&TT.fields, sizeof(struct zvalue));
1256   zlist_append(&TT.literals, &uninit_zvalue);
1257   zlist_append(&TT.stack, &uninit_zvalue);
1258   zlist_append(&TT.fields, &uninit_zvalue);
1259   FIELD[0].vst = new_zstring("", 0);
1260 }
1261 
init_compiler(void)1262 static void init_compiler(void)
1263 {
1264   // Special variables (POSIX). Must align with enum spec_var_names
1265   static char *spec_vars[] = { "ARGC", "ARGV", "CONVFMT", "ENVIRON", "FILENAME",
1266       "FNR", "FS", "NF", "NR", "OFMT", "OFS", "ORS", "RLENGTH", "RS", "RSTART",
1267       "SUBSEP", 0};
1268 
1269   init_tables();
1270   for (int k = 0; spec_vars[k]; k++) {
1271     TT.spec_var_limit = add_global(spec_vars[k]);
1272     GLOBAL[TT.spec_var_limit++].flags |= (k == 1 || k == 3) ? ZF_MAP : ZF_SCALAR;
1273     push_val(&uninit_zvalue);
1274   }
1275 }
1276 //// END Initialization
1277 
1278 //// Parsing and compiling to TT.zcode
1279 // Left binding powers
// Indexed by token number; must align with enum Toks.
static int lbp_table[] = {
  0,    // tkunusedtoken
  0,    // tkeof
  0,    // tkerr
  0,    // tknl
  250,  // tkvar
  250,  // tknumber
  250,  // tkstring
  250,  // tkregex
  250,  // tkfunc
  250,  // tkbuiltin
  0,    // tksemi
  0,    // tkcomma
  210,  // tklbracket
  0,    // tkrbracket
  200,  // tklparen
  0,    // tkrparen
  0,    // tklbrace
  0,    // tkrbrace
  190,  // tkfield
  180,  // tkincr
  180,  // tkdecr
  170,  // tkpow
  160,  // tknot
  150,  // tkmul
  150,  // tkdiv
  150,  // tkmod
  140,  // tkplus
  140,  // tkminus
  130,  // tkcat -- FAKE (?) optor for concatenation (adjacent string exprs)
  110,  // tklt
  110,  // tkle
  110,  // tkne
  110,  // tkeq
  110,  // tkgt
  110,  // tkge
  100,  // tkmatchop
  100,  // tknotmatch
  80,   // tkand
  70,   // tkor
  60,   // tkternif
  0,    // tkternelse
  50,   // tkpowasgn
  50,   // tkmodasgn
  50,   // tkmulasgn
  50,   // tkdivasgn
  50,   // tkaddasgn
  50,   // tksubasgn
  50,   // tkasgn
  0,    // tkappend
  120,  // tkpipe
  90    // tkin
};
1298 
getlbp(int tok)1299 static int getlbp(int tok)
1300 {
1301   // FIXME: should tkappend be here too? is tkpipe needed?
1302   // In print statement outside parens: make '>' end an expression
1303   if (TT.cgl.in_print_stmt && ! TT.cgl.paren_level && (tok == tkgt || tok == tkpipe))
1304     return 0;
1305   return (0 <= tok && tok <= tkin) ? lbp_table[tok] :
1306     // getline is special, not a normal builtin.
1307     // close, index, match, split, sub, gsub, sprintf, substr
1308     // are really builtin functions though bwk treats them as keywords.
1309     (tkgetline <= tok && tok <= tksubstr) ? 240 : 0;     // FIXME 240 is temp?
1310 }
1311 
// Get right binding power. Same as left except for right associative optors,
// which get one less than their left binding power.
static int getrbp(int tok)
{
  int power = getlbp(tok);

  // ternary (?:), assignment, power ops are right associative
  if (power <= 60 || power == 170) return power - 1;
  return power;
}
1319 
// Called when end of input is hit while skipping tokens after a syntax
// error; reports through error_exit().
static void unexpected_eof(void)
{
  error_exit("terminated with error(s)");
}
1324 
1325 //// syntax error diagnostic and recovery (Turner's method)
1326 // D.A. Turner, Error diagnosis and recovery in one pass compilers,
1327 // Information Processing Letters, Volume 6, Issue 4, 1977, Pages 113-115
1328 static int recovering = 0;
1329 
complain(int tk)1330 static void complain(int tk)
1331 {
1332   char op[3], tkstr[10];
1333   if (recovering) return;
1334   recovering = 1;
1335   if (!strcmp(TT.tokstr, "\n")) TT.tokstr = "<newline>";
1336   if (tksemi <= tk && tk <= tkpipe) {
1337     get_token_text(op, tk);
1338     XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, op);
1339   } else if (tk >= tkin && tk <= tksubstr) {
1340     if (tk < tkatan2) memmove(tkstr, keywords + 1 + 10 * (tk - tkin), 10);
1341     else memmove(tkstr, builtins + 1 + 10 * (tk - tkatan2), 10);
1342     *strchr(tkstr, ' ') = 0;
1343     XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, tkstr);
1344   } else XERR("syntax near '%s'\n", TT.tokstr);
1345 }
1346 
expect(int tk)1347 static void expect(int tk)
1348 {
1349   if (recovering) {
1350     while (!ISTOK(tkeof) && !ISTOK(tk))
1351       scan();
1352     if (ISTOK(tkeof)) unexpected_eof();
1353     scan(); // consume expected token
1354     recovering = 0;
1355   } else if (!havetok(tk)) complain(tk);
1356 }
1357 
skip_to(char * tklist)1358 static void skip_to(char *tklist)
1359 {
1360   do scan(); while (!ISTOK(tkeof) && !strchr(tklist, CURTOK()));
1361   if (ISTOK(tkeof)) unexpected_eof();
1362 }
1363 
1364 //// END syntax error diagnostic and recovery (Turner's method)
1365 
optional_nl_or_semi(void)1366 static void optional_nl_or_semi(void)
1367 {
1368   while (havetok(tknl) || havetok(tksemi))
1369     ;
1370 }
1371 
optional_nl(void)1372 static void optional_nl(void)
1373 {
1374   while (havetok(tknl))
1375     ;
1376 }
1377 
rparen(void)1378 static void rparen(void)
1379 {
1380   expect(tkrparen);
1381   optional_nl();
1382 }
1383 
have_comma(void)1384 static int have_comma(void)
1385 {
1386   if (!havetok(tkcomma)) return 0;
1387   optional_nl();
1388   return 1;
1389 }
1390 
check_set_map(int slotnum)1391 static void check_set_map(int slotnum)
1392 {
1393   // POSIX: The same name shall not be used within the same scope both as
1394   // a scalar variable and as an array.
1395   if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_SCALAR)
1396     XERR("scalar param '%s' used as array\n", LOCAL[-slotnum].name);
1397   if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_SCALAR)
1398     XERR("scalar var '%s' used as array\n", GLOBAL[slotnum].name);
1399   if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_MAP;
1400   if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_MAP;
1401 }
1402 
check_set_scalar(int slotnum)1403 static void check_set_scalar(int slotnum)
1404 {
1405   if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_MAP)
1406     XERR("array param '%s' used as scalar\n", LOCAL[-slotnum].name);
1407   if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_MAP)
1408     XERR("array var '%s' used as scalar\n", GLOBAL[slotnum].name);
1409   if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_SCALAR;
1410   if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_SCALAR;
1411 }
1412 
map_name(void)1413 static void map_name(void)
1414 {
1415   int slotnum;
1416   check_set_map(slotnum = find_or_add_var_name());
1417   gen2cd(tkvar, slotnum);
1418 }
1419 
check_builtin_arg_counts(int tk,int num_args,char * fname)1420 static void check_builtin_arg_counts(int tk, int num_args, char *fname)
1421 {
1422   static char builtin_1_arg[] = { tkcos, tksin, tkexp, tklog, tksqrt, tkint,
1423                                   tktolower, tktoupper, tkclose, tksystem, 0};
1424   static char builtin_2_arg[] = { tkatan2, tkmatch, tkindex, tklshift, tkrshift, 0};
1425   static char builtin_al_2_arg[] = { tkband, tkbor, tkbxor, 0};
1426   static char builtin_2_3_arg[] = { tksub, tkgsub, tksplit, tksubstr, 0};
1427   static char builtin_0_1_arg[] = { tksrand, tklength, tkfflush, 0};
1428 
1429   if (tk == tkrand && num_args)
1430     XERR("function '%s' expected no args, got %d\n", fname, num_args);
1431   else if (strchr(builtin_1_arg, tk) && num_args != 1)
1432     XERR("function '%s' expected 1 arg, got %d\n", fname, num_args);
1433   else if (strchr(builtin_2_arg, tk) && num_args != 2)
1434     XERR("function '%s' expected 2 args, got %d\n", fname, num_args);
1435   else if (strchr(builtin_al_2_arg, tk) && num_args < 2)
1436     XERR("function '%s' expected at least 2 args, got %d\n", fname, num_args);
1437   else if (strchr(builtin_2_3_arg, tk) && num_args != 2 && num_args != 3)
1438     XERR("function '%s' expected 2 or 3 args, got %d\n", fname, num_args);
1439   else if (strchr(builtin_0_1_arg, tk) && num_args != 0 && num_args != 1)
1440     XERR("function '%s' expected no arg or 1 arg, got %d\n", fname, num_args);
1441 }
1442 
builtin_call(int tk,char * builtin_name)1443 static void builtin_call(int tk, char *builtin_name)
1444 {
1445   int num_args = 0;
1446   expect(tklparen);
1447   TT.cgl.paren_level++;
1448   switch (tk) {
1449     case tksub:
1450     case tkgsub:
1451       if (ISTOK(tkregex)) {
1452         gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
1453         scan();
1454       } else expr(0);
1455       expect(tkcomma);
1456       optional_nl();
1457       expr(0);
1458       if (have_comma()) {
1459         lvalue();
1460       } else {
1461         gen2cd(tknumber, make_literal_num_val(0));
1462         gen2cd(opfldref, tkeof);
1463       }
1464       num_args = 3;
1465       break;
1466 
1467     case tkmatch:
1468       expr(0);
1469       expect(tkcomma);
1470       optional_nl();
1471       if (ISTOK(tkregex)) {
1472         gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
1473         scan();
1474       } else expr(0);
1475       num_args = 2;
1476       break;
1477 
1478     case tksplit:
1479       expr(0);
1480       expect(tkcomma);
1481       optional_nl();
1482       if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
1483         map_name();
1484         scan();
1485       } else {
1486         XERR("%s\n", "expected array name as split() 2nd arg");
1487         expr(0);
1488       }
1489       // FIXME some recovery needed here!?
1490       num_args = 2;
1491       if (have_comma()) {
1492         if (ISTOK(tkregex)) {
1493           gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
1494           scan();
1495         } else expr(0);
1496         num_args++;
1497       }
1498       break;
1499 
1500     case tklength:
1501       if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
1502         gen2cd(tkvar, find_or_add_var_name());
1503         scan();
1504         num_args++;
1505       }
1506       ATTR_FALLTHROUGH_INTENDED;
1507 
1508     default:
1509       if (ISTOK(tkrparen)) break;
1510       do {
1511         expr(0);
1512         num_args++;
1513       } while (have_comma());
1514       break;
1515   }
1516   expect(tkrparen);
1517   TT.cgl.paren_level--;
1518 
1519   check_builtin_arg_counts(tk, num_args, builtin_name);
1520 
1521   gen2cd(tk, num_args);
1522 }
1523 
function_call(void)1524 static void function_call(void)
1525 {
1526   // Function call: generate TT.zcode to:
1527   //  push placeholder for return value, push placeholder for return addr,
1528   //  push args, then push number of args, then:
1529   //      for builtins: gen opcode (e.g. tkgsub)
1530   //      for user func: gen (tkfunc, function location)
1531   //      if function not yet defined, location will be filled in when defined
1532   //          the location slots will be chained from the symbol table
1533   int functk = 0, funcnum = 0;
1534   char builtin_name[16];  // be sure it's long enough for all builtins
1535   if (ISTOK(tkbuiltin)) {
1536     functk = TT.scs->tokbuiltin;
1537     strcpy(builtin_name, TT.tokstr);
1538   } else if (ISTOK(tkfunc)) { // user function
1539     funcnum = find_func_def_entry(TT.tokstr);
1540     if (!funcnum) funcnum = add_func_def_entry(TT.tokstr);
1541     FUNC_DEF[funcnum].flags |= FUNC_CALLED;
1542     gen2cd(opprepcall, funcnum);
1543   } else error_exit("bad function %s!", TT.tokstr);
1544   scan();
1545   // length() can appear without parens
1546   int num_args = 0;
1547   if (functk == tklength && !ISTOK(tklparen)) {
1548     gen2cd(functk, 0);
1549     return;
1550   }
1551   if (functk) {   // builtin
1552     builtin_call(functk, builtin_name);
1553     return;
1554   }
1555   expect(tklparen);
1556   TT.cgl.paren_level++;
1557   if (ISTOK(tkrparen)) {
1558     scan();
1559   } else {
1560     do {
1561       if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
1562         // Function call arg that is a lone variable. Cannot tell in this
1563         // context if it is a scalar or map. Just add it to symbol table.
1564         gen2cd(tkvar, find_or_add_var_name());
1565         scan();
1566       } else expr(0);
1567       num_args++;
1568     } while (have_comma());
1569     expect(tkrparen);
1570   }
1571   TT.cgl.paren_level--;
1572   gen2cd(tkfunc, num_args);
1573 }
1574 
var(void)1575 static void var(void)
1576 {
1577   // var name is in TT.tokstr
1578   // slotnum: + means global; - means local to function
1579   int slotnum = find_or_add_var_name();
1580   scan();
1581   if (havetok(tklbracket)) {
1582     check_set_map(slotnum);
1583     int num_subscripts = 0;
1584     do {
1585       expr(0);
1586       num_subscripts++;
1587     } while (have_comma());
1588     expect(tkrbracket);
1589     if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts);
1590     gen2cd(opmap, slotnum);
1591   } else {
1592     check_set_scalar(slotnum);
1593     gen2cd(tkvar, slotnum);
1594   }
1595 }
1596 
//   Dollar $ tkfield can be followed by "any" expression, but
1598 //   the way it binds varies.
1599 //   The following are valid lvalues:
1600 //   $ ( expr )
1601 //   $ tkvar $ tknumber $ tkstring $ tkregex
1602 //   $ tkfunc(...)
1603 //   $ tkbuiltin(...)
1604 //   $ length   # with no parens after
1605 //   $ tkclose(), ... $ tksubstr
1606 //   $ tkgetline FIXME TODO TEST THIS
1607 //   $ ++ lvalue
1608 //   $ -- lvalue
1609 //   $ + expression_up_to_exponentiation (also -, ! prefix ops)
1610 //   $ $ whatever_can_follow_and_bind_to_dollar
1611 //
1612 //     tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
1613 //     tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline,
1614 //     tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
1615 //
1616 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k*k }'
1617 // 18
1618 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k*k }'
1619 // 18
1620 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k^k }'
1621 // 81
1622 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k^k }'
1623 // 8
1624 
field_op(void)1625 static void field_op(void)
1626 {
1627   // CURTOK() must be $ here.
1628   expect(tkfield);
1629   // tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
1630   // tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex,
1631   // tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
1632   if (ISTOK(tkfield)) field_op();
1633   else if (ISTOK(tkvar)) var();
1634   else primary();
1635   // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void)
1636   // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]).
1637   gen2cd(tkfield, tkeof);
1638 }
1639 
1640 // Tokens that can start expression
1641 static char exprstartsy[] = {tkvar, tknumber, tkstring, tkregex, tkfunc,
1642   tkbuiltin, tkfield, tkminus, tkplus, tknot, tkincr, tkdecr, tklparen,
1643   tkgetline, tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf,
1644   tksubstr, tkband, tkbor, tkbxor, tkrshift, tklshift, 0};
1645 
1646 // Tokens that can end statement
1647 static char stmtendsy[] = {tknl, tksemi, tkrbrace, 0};
1648 
1649 // Tokens that can follow expressions of a print statement
1650 static char printexprendsy[] = {tkgt, tkappend, tkpipe, tknl, tksemi, tkrbrace, 0};
1651 
1652 // !! Ensure this:
1653 // ternary op is right associative, so
1654 // a ? b : c ? d : e        evaluates as
1655 // a ? b : (c ? d : e)      not as
1656 // (a ? b : c) ? d : e
1657 
convert_push_to_reference(void)1658 static void convert_push_to_reference(void)
1659 {
1660   if (ZCODE[TT.zcode_last - 1] == tkvar) ZCODE[TT.zcode_last-1] = opvarref;
1661   else if (ZCODE[TT.zcode_last - 1] == opmap) ZCODE[TT.zcode_last - 1] = opmapref;
1662   else if (ZCODE[TT.zcode_last - 1] == tkfield) ZCODE[TT.zcode_last - 1] = opfldref;
1663   else error_exit("bad lvalue?");
1664 }
1665 
lvalue(void)1666 static void lvalue(void)
1667 {
1668   if (ISTOK(tkfield)) {
1669     field_op();
1670     convert_push_to_reference();
1671   } else if (ISTOK(tkvar)) {
1672     var();
1673     convert_push_to_reference();
1674   } else {
1675     XERR("syntax near '%s' (bad lvalue)\n", TT.tokstr);
1676   }
1677 }
1678 
primary(void)1679 static int primary(void)
1680 {
1681   //  On entry: CURTOK() is first token of expression
1682   //  On exit: CURTOK() is infix operator (for binary_op() to handle) or next
1683   //   token after end of expression.
1684   //  return -1 for field or var (potential lvalue);
1685   //      2 or more for comma-separated expr list
1686   //          as in "multiple subscript expression in array"
1687   //          e.g. (1, 2) in array_name, or a print/printf list;
1688   //      otherwise return 0
1689   //
1690   //  expr can start with:
1691   //      tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
1692   //      tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex,
1693   //      tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
1694   //
1695   //  bwk treats these as keywords, not builtins: close index match split sub gsub
1696   //      sprintf substr
1697   //
1698   //  bwk builtins are: atan2 cos sin exp log sqrt int rand srand length tolower
1699   //      toupper system fflush
1700   //  NOTE: fflush() is NOT in POSIX awk
1701   //
1702   //  primary() must consume prefix and postfix operators as well as
1703   //      num, string, regex, var, var with subscripts, and function calls
1704 
1705   int num_exprs = 0;
1706   int nargs, modifier;
1707   int tok = CURTOK();
1708   switch (tok) {
1709     case tkvar:
1710     case tkfield:
1711       if (ISTOK(tkvar)) var();
1712       else field_op();
1713       if (ISTOK(tkincr) || ISTOK(tkdecr)) {
1714         convert_push_to_reference();
1715         gencd(CURTOK());
1716         scan();
1717       } else return -1;
1718       break;
1719 
1720     case tknumber:
1721       gen2cd(tknumber, make_literal_num_val(TT.scs->numval));
1722       scan();
1723       break;
1724 
1725     case tkstring:
1726       gen2cd(tkstring, make_literal_str_val(TT.tokstr));
1727       scan();
1728       break;
1729 
1730     case tkregex:
1731       // When an ERE token appears as an expression in any context other
1732       // than as the right-hand of the '~' or "!~" operator or as one of
1733       // the built-in function arguments described below, the value of
1734       // the resulting expression shall be the equivalent of: $0 ~ /ere/
1735       // FIXME TODO
1736       gen2cd(opmatchrec, make_literal_regex_val(TT.tokstr));
1737       scan();
1738       break;
1739 
1740     case tkbuiltin: // various builtins
1741     case tkfunc:    // user-defined function
1742       function_call();
1743       break;
1744 
1745     // Unary prefix ! + -
1746     case tknot:
1747     case tkminus:
1748     case tkplus:
1749       scan();
1750       expr(getlbp(tknot));   // unary +/- same precedence as !
1751       if (tok == tknot) gencd(tknot);
1752       else gencd(opnegate);               // forces to number
1753       if (tok == tkplus) gencd(opnegate); // forces to number
1754       break;
1755 
1756       // Unary prefix ++ -- MUST take lvalue
1757     case tkincr:
1758     case tkdecr:
1759       scan();
1760       lvalue();
1761       if (tok == tkincr) gencd(oppreincr);
1762       else gencd(oppredecr);
1763       break;
1764 
1765     case tklparen:
1766       scan();
1767       TT.cgl.paren_level++;
1768       num_exprs = 0;
1769       do {
1770         expr(0);
1771         num_exprs++;
1772       } while (have_comma());
1773       expect(tkrparen);
1774       TT.cgl.paren_level--;
1775       if (num_exprs > 1) return num_exprs;
1776       break;
1777 
1778     case tkgetline:
1779       // getline may be (according to awk book):
1780       // getline [var [<file]]
1781       // getline <file
1782       // cmd | getline [var]
1783       // var must be lvalue (can be any lvalue?)
1784       scan();
1785       nargs = 0;
1786       modifier = tkeof;
1787       if (ISTOK(tkfield) || ISTOK(tkvar)) {
1788         lvalue();
1789         nargs++;
1790       }
1791       if (havetok(tklt)) {
1792         expr(getrbp(tkcat));   // bwk "historical practice" precedence
1793         nargs++;
1794         modifier = tklt;
1795       }
1796       gen2cd(tkgetline, nargs);
1797       gencd(modifier);
1798       break;
1799 
1800     default:
1801       XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
1802       skip_to(stmtendsy);
1803       break;
1804   }
1805   return 0;
1806 }
1807 
binary_op(int optor)1808 static void binary_op(int optor)  // Also for ternary ?: optor.
1809 {
1810   int nargs, cdx = 0;  // index in TT.zcode list
1811   int rbp = getrbp(optor);
1812   if (optor != tkcat) scan();
1813   // CURTOK() holds first token of right operand.
1814   switch (optor) {
1815     case tkin:
1816       // right side of 'in' must be (only) an array name
1817       map_name();
1818       gencd(tkin);
1819       scan();
1820       // FIXME TODO 20230109 x = y in a && 2 works OK?
1821       // x = y in a + 2 does not; it's parsed as x = (y in a) + 2
1822       // The +2 is not cat'ed with (y in a) as in bwk's OTA.
1823       // Other awks see y in a + 2 as a syntax error. They (may)
1824       // not want anything after y in a except a lower binding operator
1825       // (&& || ?:) or end of expression, i.e. ')' ';' '}'
1826       break;
1827 
1828   case tkpipe:
1829       expect(tkgetline);
1830       nargs = 1;
1831       if (ISTOK(tkfield) || ISTOK(tkvar)) {
1832         lvalue();
1833         nargs++;
1834       }
1835       gen2cd(tkgetline, nargs);
1836       gencd(tkpipe);
1837       break;
1838 
1839   case tkand:
1840   case tkor:
1841       optional_nl();
1842       gen2cd(optor, -1);  // tkand: jump if false, else drop
1843       cdx = TT.zcode_last;   // tkor:  jump if true, else drop
1844       expr(rbp);
1845       gencd(opnotnot);    // replace TT.stack top with truth value
1846       ZCODE[cdx] = TT.zcode_last - cdx;
1847       break;
1848 
1849   case tkternif:
1850       gen2cd(optor, -1);
1851       cdx = TT.zcode_last;
1852       expr(0);
1853       expect(tkternelse);
1854       gen2cd(tkternelse, -1);
1855       ZCODE[cdx] = TT.zcode_last - cdx;
1856       cdx = TT.zcode_last;
1857       expr(rbp);
1858       ZCODE[cdx] = TT.zcode_last - cdx;
1859       break;
1860 
1861   case tkmatchop:
1862   case tknotmatch:
1863       expr(rbp);
1864       if (ZCODE[TT.zcode_last - 1] == opmatchrec) ZCODE[TT.zcode_last - 1] = tkregex;
1865       gencd(optor);
1866       break;
1867 
1868   default:
1869       expr(rbp);
1870       gencd(optor);
1871   }
1872 }
1873 
cat_start_concated_expr(int tok)1874 static int cat_start_concated_expr(int tok)
1875 {
1876   // concat'ed expr can start w/ var number string func builtin $ ! ( (or ++ if prev was not lvalue)
1877   static char exprstarttermsy[] = {tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin,
1878     tkfield, tknot, tkincr, tkdecr, tklparen, tkgetline, 0};
1879 
1880   // NOTE this depends on builtins (close etc) being >= tkgetline
1881   return !! strchr(exprstarttermsy, tok) || tok >= tkgetline;
1882 }
1883 
1884 #define CALLED_BY_PRINT 99987 // Arbitrary, different from any real rbp value
1885 
expr(int rbp)1886 static int expr(int rbp)
1887 {
1888   // On entry: TT.scs has first symbol of expression, e.g. var, number, string,
1889   // regex, func, getline, left paren, prefix op ($ ++ -- ! unary + or -) etc.
1890   static char asgnops[] = {tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
1891     tkaddasgn, tksubasgn, tkasgn, 0};
1892   int prim_st = primary();
1893   // If called directly by print_stmt(), and found a parenthesized expression list
1894   //    followed by an end of print statement: any of > >> | ; } <newline>
1895   //    Then: return the count of expressions in list
1896   //    Else: continue parsing an expression
1897   if (rbp == CALLED_BY_PRINT) {
1898     if (prim_st > 0 && strchr(printexprendsy, CURTOK())) return prim_st;
1899     else rbp = 0;
1900   }
1901 
1902   // mult_expr_list in parens must be followed by 'in' unless it
1903   // immediately follows print or printf, where it may still be followed
1904   // by 'in' ... unless at end of statement
1905   if (prim_st > 0 && ! ISTOK(tkin))
1906     XERR("syntax near '%s'; expected 'in'\n", TT.tokstr);
1907   if (prim_st > 0) gen2cd(tkrbracket, prim_st);
1908   // primary() has eaten subscripts, function args, postfix ops.
1909   // CURTOK() should be a binary op.
1910   int optor = CURTOK();
1911   if (strchr(asgnops, optor)) {
1912 
1913     // TODO FIXME ?  NOT SURE IF THIS WORKS RIGHT!
1914     // awk does not parse according to POSIX spec in some odd cases.
1915     // When an assignment (lvalue =) is on the right of certain operators,
1916     // it is not treated as a bad lvalue (as it is in C).
1917     // Example: (1 && a=2) # no error; the assignment is performed.
1918     // This happens for ?: || && ~ !~ < <= ~= == > >=
1919     //
1920     static char odd_assignment_rbp[] = {59, 60, 70, 80, 100, 110, 0};
1921     if (prim_st < 0 && (rbp <= getrbp(optor) || strchr(odd_assignment_rbp, rbp))) {
1922       convert_push_to_reference();
1923       scan();
1924       expr(getrbp(optor));
1925       gencd(optor);
1926       return 0;
1927     }
1928     XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
1929     skip_to(stmtendsy);
1930   }
1931   if (cat_start_concated_expr(optor)) optor = tkcat;
1932   while (rbp < getlbp(optor)) {
1933     binary_op(optor);
1934     // HERE tok s/b an operator or expression terminator ( ; etc.).
1935     optor = CURTOK();
1936     if (cat_start_concated_expr(optor)) optor = tkcat;
1937   }
1938   return 0;
1939 }
1940 
print_stmt(int tk)1941 static void print_stmt(int tk)
1942 {
1943   static char outmodes[] = {tkgt, tkappend, tkpipe, 0};
1944   int num_exprs = 0, outmode;
1945   TT.cgl.in_print_stmt = 1;
1946   expect(tk); // tkprint or tkprintf
1947   if ((tk == tkprintf) || !strchr(printexprendsy, CURTOK())) {
1948     // printf always needs expression
1949     // print non-empty statement needs expression
1950     num_exprs = expr(CALLED_BY_PRINT);
1951     if (num_exprs > 0 && !strchr(printexprendsy, CURTOK())) FATAL("print stmt bug");
1952     if (!num_exprs) {
1953       for (num_exprs++; have_comma(); num_exprs++)
1954         expr(0);
1955     }
1956   }
1957   outmode = CURTOK();
1958   if (strchr(outmodes, outmode)) {
1959     scan();
1960     expr(0); // FIXME s/b only bwk term? check POSIX
1961     num_exprs++;
1962   } else outmode = 0;
1963   gen2cd(tk, num_exprs);
1964   gencd(outmode);
1965   TT.cgl.in_print_stmt = 0;
1966 }
1967 
delete_stmt(void)1968 static void delete_stmt(void)
1969 {
1970   expect(tkdelete);
1971   if (ISTOK(tkvar)) {
1972     int slotnum = find_or_add_var_name();
1973     check_set_map(slotnum);
1974     scan();
1975     if (havetok(tklbracket)) {
1976       int num_subscripts = 0;
1977       do {
1978         expr(0);
1979         num_subscripts++;
1980       } while (have_comma());
1981       expect(tkrbracket);
1982       if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts);
1983       gen2cd(opmapref, slotnum);
1984       gencd(tkdelete);
1985     } else {
1986       // delete entire map (elements only; var is still a map)
1987       gen2cd(opmapref, slotnum);
1988       gencd(opmapdelete);
1989     }
1990   } else expect(tkvar);
1991 }
1992 
simple_stmt(void)1993 static void simple_stmt(void)
1994 {
1995   if (strchr(exprstartsy, CURTOK())) {
1996     expr(0);
1997     gencd(opdrop);
1998     return;
1999   }
2000   switch (CURTOK()) {
2001     case tkprint:
2002     case tkprintf:
2003       print_stmt(CURTOK());
2004       break;
2005 
2006     case tkdelete:
2007       delete_stmt();
2008       break;
2009 
2010     default:
2011       XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
2012       skip_to(stmtendsy);
2013   }
2014 }
2015 
prev_was_terminated(void)2016 static int prev_was_terminated(void)
2017 {
2018   return !!strchr(stmtendsy, TT.prevtok);
2019 }
2020 
is_nl_semi(void)2021 static int is_nl_semi(void)
2022 {
2023   return ISTOK(tknl) || ISTOK(tksemi);
2024 }
2025 
if_stmt(void)2026 static void if_stmt(void)
2027 {
2028   expect(tkif);
2029   expect(tklparen);
2030   expr(0);
2031   rparen();
2032   gen2cd(tkif, -1);
2033   int cdx = TT.zcode_last;
2034   stmt();
2035   if (!prev_was_terminated() && is_nl_semi()) {
2036     scan();
2037     optional_nl();
2038   }
2039   if (prev_was_terminated()) {
2040     optional_nl();
2041     if (havetok(tkelse)) {
2042       gen2cd(tkelse, -1);
2043       ZCODE[cdx] = TT.zcode_last - cdx;
2044       cdx = TT.zcode_last;
2045       optional_nl();
2046       stmt();
2047     }
2048   }
2049   ZCODE[cdx] = TT.zcode_last - cdx;
2050 }
2051 
save_break_continue(int * brk,int * cont)2052 static void save_break_continue(int *brk, int *cont)
2053 {
2054   *brk = TT.cgl.break_dest;
2055   *cont = TT.cgl.continue_dest;
2056 }
2057 
restore_break_continue(int * brk,int * cont)2058 static void restore_break_continue(int *brk, int *cont)
2059 {
2060   TT.cgl.break_dest = *brk;
2061   TT.cgl.continue_dest = *cont;
2062 }
2063 
while_stmt(void)2064 static void while_stmt(void)
2065 {
2066   int brk, cont;
2067   save_break_continue(&brk, &cont);
2068   expect(tkwhile);
2069   expect(tklparen);
2070   TT.cgl.continue_dest = TT.zcode_last + 1;
2071   expr(0);
2072   rparen();
2073   gen2cd(tkwhile, 2);    // drop, jump if true
2074   TT.cgl.break_dest = TT.zcode_last + 1;
2075   gen2cd(opjump, -1);     // jump here to break
2076   stmt();
2077   gen2cd(opjump, -1);     // jump to continue
2078   ZCODE[TT.zcode_last] = TT.cgl.continue_dest - TT.zcode_last - 1;
2079   ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
2080   restore_break_continue(&brk, &cont);
2081 }
2082 
do_stmt(void)2083 static void do_stmt(void)
2084 {
2085   int brk, cont;
2086   save_break_continue(&brk, &cont);
2087   expect(tkdo);
2088   optional_nl();
2089   gen2cd(opjump, 4);   // jump over jumps, to statement
2090   TT.cgl.continue_dest = TT.zcode_last + 1;
2091   gen2cd(opjump, -1);   // here on continue
2092   TT.cgl.break_dest = TT.zcode_last + 1;
2093   gen2cd(opjump, -1);   // here on break
2094   stmt();
2095   if (!prev_was_terminated()) {
2096     if (is_nl_semi()) {
2097       scan();
2098       optional_nl();
2099     } else {
2100       XERR("syntax near '%s' -- ';' or newline expected\n", TT.tokstr);
2101       // FIXME
2102     }
2103   }
2104   ZCODE[TT.cgl.continue_dest + 1] = TT.zcode_last - TT.cgl.continue_dest - 1;
2105   optional_nl();
2106   expect(tkwhile);
2107   expect(tklparen);
2108   expr(0);
2109   rparen();
2110   gen2cd(tkwhile, TT.cgl.break_dest - TT.zcode_last - 1);
2111   ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
2112   restore_break_continue(&brk, &cont);
2113 }
2114 
for_not_map_iter(void)2115 static void for_not_map_iter(void)
2116 {
2117   // Here after loop initialization, if any; loop condition
2118   int condition_loc = TT.zcode_last + 1;
2119   if (havetok(tksemi)) {
2120     // "endless" loop variant; no condition
2121     // no NL allowed here in OTA
2122     gen2cd(opjump, -1);     // jump to statement
2123   } else {
2124     optional_nl();                // NOT posix or awk book; in OTA
2125     expr(0);                 // loop while true
2126     expect(tksemi);
2127     gen2cd(tkwhile, -1);    // drop, jump to statement if true
2128   }
2129   optional_nl();                    // NOT posix or awk book; in OTA
2130   TT.cgl.break_dest = TT.zcode_last + 1;
2131   gen2cd(opjump, -1);
2132   TT.cgl.continue_dest = TT.zcode_last + 1;
2133   if (!ISTOK(tkrparen)) simple_stmt();  // "increment"
2134   gen2cd(opjump, condition_loc - TT.zcode_last - 3);
2135   rparen();
2136   ZCODE[TT.cgl.break_dest - 1] = TT.zcode_last - TT.cgl.break_dest + 1;
2137   stmt();
2138   gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3);
2139   ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
2140 }
2141 
valid_for_array_iteration(int first,int last)2142 static int valid_for_array_iteration(int first, int last)
2143 {
2144   return ZCODE[first] == tkvar && ZCODE[first + 2] == tkvar
2145       && ZCODE[first + 4] == tkin && ZCODE[first + 5] == opdrop
2146       && first + 5 == last;
2147 }
2148 
for_stmt(void)2149 static void for_stmt(void)
2150 {
2151   int brk, cont;
2152   save_break_continue(&brk, &cont);
2153   expect(tkfor);
2154   expect(tklparen);
2155   if (havetok(tksemi)) {
2156     // No "initialization" part
2157     for_not_map_iter();
2158   } else {
2159     int loop_start_loc = TT.zcode_last + 1;
2160     simple_stmt();  // initializaton part, OR varname in arrayname form
2161     if (!havetok(tkrparen)) {
2162       expect(tksemi);
2163       for_not_map_iter();
2164     } else {
2165       // Must be map iteration
2166       // Check here for varname in varname!
2167       // FIXME TODO must examine generated TT.zcode for var in array?
2168       if (!valid_for_array_iteration(loop_start_loc, TT.zcode_last))
2169         XERR("%s", "bad 'for (var in array)' loop\n");
2170       else {
2171         ZCODE[TT.zcode_last-5] = opvarref;
2172         ZCODE[TT.zcode_last-1] = tknumber;
2173         ZCODE[TT.zcode_last] = make_literal_num_val(-1);
2174         TT.cgl.continue_dest = TT.zcode_last + 1;
2175         gen2cd(opmapiternext, 2);
2176         TT.cgl.break_dest = TT.zcode_last + 1;
2177         gen2cd(opjump, -1);   // fill in with loc after stmt
2178       }
2179       optional_nl();
2180       // fixup TT.stack if return or exit inside for (var in array)
2181       TT.cgl.stack_offset_to_fix += 3;
2182       stmt();
2183       TT.cgl.stack_offset_to_fix -= 3;
2184       gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3);
2185       ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
2186       gencd(opdrop);
2187       gencd(opdrop);
2188       gencd(opdrop);
2189     }
2190   }
2191   restore_break_continue(&brk, &cont);
2192 }
2193 
stmt(void)2194 static void stmt(void)
2195 {
2196   switch (CURTOK()) {
2197     case tkeof:
2198       break;     // FIXME ERROR?
2199 
2200     case tkbreak:
2201       scan();
2202       if (TT.cgl.break_dest) gen2cd(tkbreak, TT.cgl.break_dest - TT.zcode_last - 3);
2203       else XERR("%s", "break not in a loop\n");
2204       break;
2205 
2206     case tkcontinue:
2207       scan();
2208       if (TT.cgl.continue_dest)
2209         gen2cd(tkcontinue, TT.cgl.continue_dest - TT.zcode_last - 3);
2210       else XERR("%s", "continue not in a loop\n");
2211       break;
2212 
2213     case tknext:
2214       scan();
2215       gencd(tknext);
2216       if (TT.cgl.rule_type) XERR("%s", "next inside BEGIN or END\n");
2217       if (TT.cgl.in_function_body) XERR("%s", "next inside function def\n");
2218       break;
2219 
2220     case tknextfile:
2221       scan();
2222       gencd(tknextfile);
2223       if (TT.cgl.rule_type) XERR("%s", "nextfile inside BEGIN or END\n");
2224       if (TT.cgl.in_function_body) XERR("%s", "nextfile inside function def\n");
2225       break;
2226 
2227     case tkexit:
2228       scan();
2229       if (strchr(exprstartsy, CURTOK())) {
2230         expr(0);
2231       } else gen2cd(tknumber, make_literal_num_val(NO_EXIT_STATUS));
2232       gencd(tkexit);
2233       break;
2234 
2235     case tkreturn:
2236       scan();
2237       if (TT.cgl.stack_offset_to_fix) gen2cd(opdrop_n, TT.cgl.stack_offset_to_fix);
2238       if (strchr(exprstartsy, CURTOK())) {
2239         expr(0);
2240       } else gen2cd(tknumber, make_literal_num_val(0.0));
2241       gen2cd(tkreturn, TT.cgl.nparms);
2242       if (!TT.cgl.in_function_body) XERR("%s", "return outside function def\n");
2243       break;
2244 
2245     case tklbrace:
2246       action(tklbrace);
2247       break;
2248 
2249     case tkif:
2250       if_stmt();
2251       break;
2252 
2253     case tkwhile:
2254       while_stmt();
2255       break;
2256 
2257     case tkdo:
2258       do_stmt();
2259       break;
2260 
2261     case tkfor:
2262       for_stmt();
2263       break;
2264 
2265     case tksemi:
2266       scan();
2267       break;
2268     default:
2269       simple_stmt();      // expression print printf delete
2270   }
2271 }
2272 
add_param(int funcnum,char * s)2273 static void add_param(int funcnum, char *s)
2274 {
2275   if (!find_local_entry(s)) add_local_entry(s);
2276   else XERR("function '%s' dup param '%s'\n", FUNC_DEF[funcnum].name, s);
2277   TT.cgl.nparms++;
2278 
2279   // POSIX: The same name shall not be used as both a function parameter name
2280   // and as the name of a function or a special awk variable.
2281   // !!! NOTE seems implementations exc. mawk only compare param names with
2282   // builtin funcs; use same name as userfunc is OK!
2283   if (!strcmp(s, FUNC_DEF[funcnum].name))
2284     XERR("function '%s' param '%s' matches func name\n",
2285         FUNC_DEF[funcnum].name, s);
2286   if (find_global(s) && find_global(s) < TT.spec_var_limit)
2287     XERR("function '%s' param '%s' matches special var\n",
2288         FUNC_DEF[funcnum].name, s);
2289 }
2290 
function_def(void)2291 static void function_def(void)
2292 {
2293   expect(tkfunction);
2294   int funcnum = find_func_def_entry(TT.tokstr);
2295   if (!funcnum) {
2296     funcnum = add_func_def_entry(TT.tokstr);
2297   } else if (FUNC_DEF[funcnum].flags & FUNC_DEFINED) {
2298     XERR("dup defined function '%s'\n", TT.tokstr);
2299   }
2300   FUNC_DEF[funcnum].flags |= FUNC_DEFINED;
2301   if (find_global(TT.tokstr)) {
2302     // POSIX: The same name shall not be used both as a variable name with
2303     // global scope and as the name of a function.
2304     XERR("function name '%s' previously defined\n", TT.tokstr);
2305   }
2306 
2307   gen2cd(tkfunction, funcnum);
2308   FUNC_DEF[funcnum].zcode_addr = TT.zcode_last - 1;
2309   TT.cgl.funcnum = funcnum;
2310   TT.cgl.nparms = 0;
2311   if (ISTOK(tkfunc)) expect(tkfunc); // func name with no space before (
2312   else expect(tkvar);  // func name with space before (
2313   expect(tklparen);
2314   if (ISTOK(tkvar)) {
2315     add_param(funcnum, TT.tokstr);
2316     scan();
2317     // FIXME is the the best way? what if TT.tokstr not a tkvar?
2318     while (have_comma()) {
2319       add_param(funcnum, TT.tokstr);
2320       expect(tkvar);
2321     }
2322   }
2323   rparen();
2324   if (ISTOK(tklbrace)) {
2325     TT.cgl.in_function_body = 1;
2326     action(tkfunc);
2327     TT.cgl.in_function_body = 0;
2328     // Need to return uninit value if falling off end of function.
2329     gen2cd(tknumber, make_uninit_val());
2330     gen2cd(tkreturn, TT.cgl.nparms);
2331   } else {
2332     XERR("syntax near '%s'\n", TT.tokstr);
2333     // FIXME some recovery needed here!?
2334   }
2335   // Do not re-init locals table for dup function.
2336   // Avoids memory leak detected by LeakSanitizer.
2337   if (!FUNC_DEF[funcnum].function_locals.base) {
2338     FUNC_DEF[funcnum].function_locals = TT.locals_table;
2339     init_locals_table();
2340   }
2341 }
2342 
action(int action_type)2343 static void action(int action_type)
2344 {
2345 (void)action_type;
2346   // action_type is tkbegin, tkend, tkdo (every line), tkif (if pattern),
2347   //                  tkfunc (function body), tklbrace (compound statement)
2348   // Should have lbrace on entry.
2349   expect(tklbrace);
2350   for (;;) {
2351     if (ISTOK(tkeof)) unexpected_eof();
2352     optional_nl_or_semi();
2353     if (havetok(tkrbrace)) {
2354       break;
2355     }
2356     stmt();
2357     // stmt() is normally unterminated here, but may be terminated if we
2358     // have if with no else (had to consume terminator looking for else)
2359     //   !!!   if (ISTOK(tkrbrace) || prev_was_terminated())
2360     if (prev_was_terminated()) continue;
2361     if (!is_nl_semi() && !ISTOK(tkrbrace)) {
2362       XERR("syntax near '%s' -- newline, ';', or '}' expected\n", TT.tokstr);
2363       while (!is_nl_semi() && !ISTOK(tkrbrace) && !ISTOK(tkeof)) scan();
2364       if (ISTOK(tkeof)) unexpected_eof();
2365     }
2366     if (havetok(tkrbrace)) break;
2367     // Must be semicolon or newline
2368     scan();
2369   }
2370 }
2371 
rule(void)2372 static void rule(void)
2373 {
2374   //       pa_pat
2375   //     | pa_pat lbrace stmtlist '}'
2376   //     | pa_pat ',' opt_nl pa_pat
2377   //     | pa_pat ',' opt_nl pa_pat lbrace stmtlist '}'
2378   //     | lbrace stmtlist '}'
2379   //     | XBEGIN lbrace stmtlist '}'
2380   //     | XEND lbrace stmtlist '}'
2381   //     | FUNC funcname '(' varlist rparen  lbrace stmtlist '}'
2382 
2383   switch (CURTOK()) {
2384     case tkbegin:
2385       scan();
2386       if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin] = TT.zcode_last - TT.cgl.last_begin;
2387       else TT.cgl.first_begin = TT.zcode_last + 1;
2388 
2389       TT.cgl.rule_type = tkbegin;
2390       action(tkbegin);
2391       TT.cgl.rule_type = 0;
2392       gen2cd(opjump, -1);
2393       TT.cgl.last_begin = TT.zcode_last;
2394       break;
2395 
2396     case tkend:
2397       scan();
2398       if (TT.cgl.last_end) ZCODE[TT.cgl.last_end] = TT.zcode_last - TT.cgl.last_end;
2399       else TT.cgl.first_end = TT.zcode_last + 1;
2400 
2401       TT.cgl.rule_type = tkbegin;
2402       action(tkend);
2403       TT.cgl.rule_type = 0;
2404       gen2cd(opjump, -1);
2405       TT.cgl.last_end = TT.zcode_last;
2406       break;
2407 
2408     case tklbrace:
2409       if (TT.cgl.last_recrule)
2410         ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule;
2411       else TT.cgl.first_recrule = TT.zcode_last + 1;
2412       action(tkdo);
2413       gen2cd(opjump, -1);
2414       TT.cgl.last_recrule = TT.zcode_last;
2415       break;
2416 
2417     case tkfunction:
2418       function_def();
2419       break;
2420     default:
2421       if (TT.cgl.last_recrule)
2422         ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule;
2423       else TT.cgl.first_recrule = TT.zcode_last + 1;
2424       gen2cd(opjump, 1);
2425       gencd(tkeof);
2426       int cdx = 0, saveloc = TT.zcode_last;
2427       expr(0);
2428       if (!have_comma()) {
2429         gen2cd(tkif, -1);
2430         cdx = TT.zcode_last;
2431       } else {
2432         gen2cd(oprange2, ++TT.cgl.range_pattern_num);
2433         gencd(-1);
2434         cdx = TT.zcode_last;
2435         ZCODE[saveloc-2] = oprange1;
2436         ZCODE[saveloc-1] = TT.cgl.range_pattern_num;
2437         ZCODE[saveloc] = TT.zcode_last - saveloc;
2438         expr(0);
2439         gen2cd(oprange3, TT.cgl.range_pattern_num);
2440       }
2441       if (ISTOK(tklbrace)) {
2442         action(tkif);
2443         ZCODE[cdx] = TT.zcode_last - cdx;
2444       } else {
2445         gencd(opprintrec);   // print $0 ?
2446         ZCODE[cdx] = TT.zcode_last - cdx;
2447       }
2448       gen2cd(opjump, -1);
2449       TT.cgl.last_recrule = TT.zcode_last;
2450   }
2451 }
2452 
diag_func_def_ref(void)2453 static void diag_func_def_ref(void)
2454 {
2455   int n = zlist_len(&TT.func_def_table);
2456   for (int k = 1; k < n; k++) {
2457     if ((FUNC_DEF[k].flags & FUNC_CALLED) &&
2458             !(FUNC_DEF[k].flags & FUNC_DEFINED)) {
2459       // Sorry, we can't tell where this was called from, for now at least.
2460       XERR("Undefined function '%s'", FUNC_DEF[k].name);
2461     }
2462   }
2463 }
2464 
compile(void)2465 static void compile(void)
2466 {
2467   init_compiler();
2468   init_scanner();
2469   scan();
2470   optional_nl_or_semi();        // Does posix allow NL or ; before first rule?
2471   while (! ISTOK(tkeof)) {
2472     rule();
2473     optional_nl_or_semi();        // NOT POSIX
2474   }
2475 
2476 
2477   if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin-1] = opquit;
2478   if (TT.cgl.last_end) ZCODE[TT.cgl.last_end-1] = opquit;
2479   if (TT.cgl.last_recrule) ZCODE[TT.cgl.last_recrule-1] = opquit;
2480 
2481   gen2cd(tknumber, make_literal_num_val(0.0));
2482   gencd(tkexit);
2483   gencd(opquit);
2484   // If there are only BEGIN and END or only END actions, generate actions to
2485   // read all input before END.
2486   if (TT.cgl.first_end && !TT.cgl.first_recrule) {
2487     gencd(opquit);
2488     TT.cgl.first_recrule = TT.zcode_last;
2489   }
2490   gencd(opquit);  // One more opcode to keep ip in bounds in run code.
2491   diag_func_def_ref();
2492 }
2493 
2494 ////////////////////
2495 //// runtime
2496 ////////////////////
2497 
check_numeric_string(struct zvalue * v)2498 static void check_numeric_string(struct zvalue *v)
2499 {
2500   if (v->vst) {
2501     char *end, *s = v->vst->str;
2502     // Significant speed gain with this test:
2503     // num string must begin space, +, -, ., or digit.
2504     if (strchr("+-.1234567890 ", *s)) {
2505       double num = strtod(s, &end);
2506       if (s == end || end[strspn(end, " ")]) return;
2507       v->num = num;
2508       v->flags |= ZF_NUM | ZF_STR | ZF_NUMSTR;
2509     }
2510   }
2511 }
2512 
num_to_zstring(double n,char * fmt)2513 static struct zstring *num_to_zstring(double n, char *fmt)
2514 {
2515   int k;
2516   if (n == (long long)n) k = snprintf(TT.pbuf, PBUFSIZE, "%lld", (long long)n);
2517   else k = snprintf(TT.pbuf, PBUFSIZE, fmt, n);
2518   if (k < 0 || k >= PBUFSIZE) FFATAL("error encoding %f via '%s'", n, fmt);
2519   return new_zstring(TT.pbuf, k);
2520 }
2521 
2522 ////////////////////
2523 //// regex routines
2524 ////////////////////
2525 
// Process backslash escapes in s, in place, and return s.
// Handles the single-letter escapes listed in 'escapes', \ddd (up to three
// digits, accumulated base 8) and \xhh (one or two hex digits).
// When is_regex is nonzero, "\\" is not collapsed and an unrecognized
// escape keeps its backslash, so the regex compiler sees it.
static char *escape_str(char *s, int is_regex)
{
  char *p, *escapes = is_regex ? "abfnrtv\"/" : "\\abfnrtv\"/";
  // FIXME TODO should / be in there?
  char *s0 = s, *to = s;
  while ((*to = *s)) {
    if (*s != '\\') { to++, s++;
    } else if ((p = strchr(escapes, *++s))) {
      // checking char after \ for known escapes
      // (strchr also matches the terminating NUL, which maps to c == 0)
      int c = (is_regex?"\a\b\f\n\r\t\v\"/":"\\\a\b\f\n\r\t\v\"/")[p-escapes];
      if (c) *to = c, s++;  // else final backslash
      to++;
    } else if ('0' <= *s && *s <= '9') {
      int k, c = *s++ - '0';
      for (k = 0; k < 2 && '0' <= *s && *s <= '9'; k++)
        c = c * 8 + *s++ - '0';
      *to++ = c;
    } else if (*s == 'x') {
      // <ctype.h> functions need an unsigned char value; a plain char with
      // the high bit set would otherwise be undefined behavior.
      if (isxdigit((unsigned char)s[1])) {
        int c = hexval(*++s);
        if (isxdigit((unsigned char)s[1])) c = c * 16 + hexval(*++s);
        *to++ = c, s++;
      }
      // else: no hex digit follows; the backslash is dropped and the 'x'
      // is copied as an ordinary char on the next loop pass.
    } else {
      if (is_regex) *to++ = '\\';
      *to++ = *s++;
    }
  }
  return s0;
}
2556 
force_maybemap_to_scalar(struct zvalue * v)2557 static void force_maybemap_to_scalar(struct zvalue *v)
2558 {
2559   if (!(v->flags & ZF_ANYMAP)) return;
2560   if (v->flags & ZF_MAP || v->map->count)
2561     FATAL("array in scalar context");
2562   v->flags = 0;
2563   v->map = 0; // v->flags = v->map = 0 gets warning
2564 }
2565 
force_maybemap_to_map(struct zvalue * v)2566 static void force_maybemap_to_map(struct zvalue *v)
2567 {
2568   if (v->flags & ZF_MAYBEMAP) v->flags = ZF_MAP;
2569 }
2570 
2571 // fmt_offs is either CONVFMT or OFMT (offset in stack to zvalue)
to_str_fmt(struct zvalue * v,int fmt_offs)2572 static struct zvalue *to_str_fmt(struct zvalue *v, int fmt_offs)
2573 {
2574   force_maybemap_to_scalar(v);
2575   // TODO: consider handling numstring differently
2576   if (v->flags & ZF_NUMSTR) v->flags = ZF_STR;
2577   if (IS_STR(v)) return v;
2578   else if (!v->flags) { // uninitialized
2579     v->vst = new_zstring("", 0);
2580   } else if (IS_NUM(v)) {
2581     zvalue_release_zstring(v);
2582     if (!IS_STR(&STACK[fmt_offs])) {
2583       zstring_release(&STACK[fmt_offs].vst);
2584       STACK[fmt_offs].vst = num_to_zstring(STACK[fmt_offs].num, "%.6g");
2585       STACK[fmt_offs].flags = ZF_STR;
2586     }
2587     v->vst = num_to_zstring(v->num, STACK[fmt_offs].vst->str);
2588   } else {
2589     FATAL("Wrong or unknown type in to_str_fmt\n");
2590   }
2591   v->flags = ZF_STR;
2592   return v;
2593 }
2594 
to_str(struct zvalue * v)2595 static struct zvalue *to_str(struct zvalue *v)
2596 {
2597   return to_str_fmt(v, CONVFMT);
2598 }
2599 
2600 // TODO FIXME Is this needed? (YES -- investigate) Just use to_str()?
2601 #define ENSURE_STR(v) (IS_STR(v) ? (v) : to_str(v))
2602 
rx_zvalue_compile(regex_t ** rx,struct zvalue * pat)2603 static void rx_zvalue_compile(regex_t **rx, struct zvalue *pat)
2604 {
2605   if (IS_RX(pat)) *rx = pat->rx;
2606   else {
2607     zvalue_dup_zstring(to_str(pat));
2608     escape_str(pat->vst->str, 1);
2609     xregcomp(*rx, pat->vst->str, REG_EXTENDED);
2610   }
2611 }
2612 
rx_zvalue_free(regex_t * rx,struct zvalue * pat)2613 static void rx_zvalue_free(regex_t *rx, struct zvalue *pat)
2614 {
2615   if (!IS_RX(pat) || rx != pat->rx) regfree(rx);
2616 }
2617 
2618 // Used by the match/not match ops (~ !~) and implicit $0 match (/regex/)
match(struct zvalue * zvsubject,struct zvalue * zvpat)2619 static int match(struct zvalue *zvsubject, struct zvalue *zvpat)
2620 {
2621   int r;
2622   regex_t rx, *rxp = &rx;
2623   rx_zvalue_compile(&rxp, zvpat);
2624   if ((r = regexec(rxp, to_str(zvsubject)->vst->str, 0, 0, 0)) != 0) {
2625     if (r != REG_NOMATCH) {
2626       char errbuf[256];
2627       regerror(r, &rx, errbuf, sizeof(errbuf));
2628       // FIXME TODO better diagnostic here
2629       error_exit("regex match error %d: %s", r, errbuf);
2630     }
2631     rx_zvalue_free(rxp, zvpat);
2632     return 1;
2633   }
2634   rx_zvalue_free(rxp, zvpat);
2635   return 0;
2636 }
2637 
// Search s for rx. On a match, store the match offsets in *start and *end
// and return 0. Return REG_NOMATCH if there is no match; any other
// regexec() error is fatal.
static int rx_find(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags)
{
  regmatch_t found[1];
  int status = regexec(rx, s, 1, found, eflags);

  if (status == REG_NOMATCH) return status;
  if (status) FATAL("regexec error");  // TODO ? use regerr() to meaningful msg
  *start = found[0].rm_so;
  *end = found[0].rm_eo;
  return 0;
}
2648 
// Differs from rx_find() in that FS cannot match null (empty) string.
// See https://www.austingroupbugs.net/view.php?id=1468.
// Returns 0 with *start/*end set (as offsets into s) for the first
// non-empty match, or REG_NOMATCH if there is none.
static int rx_find_FS(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags)
{
  int r = rx_find(rx, s, start, end, eflags);
  if (r || *start != *end) return r;  // not found, or found non-empty match

  // Found empty match, retry starting past the match
  char *p = s + *end;
  if (!*p) return REG_NOMATCH;  // End of string, no non-empty match found

  // Empty match not at EOS, move ahead and try again
  for (;;) {
    if (r) return REG_NOMATCH;           // search failed
    if (*start != *end) break;           // non-empty match found
    if (!*++p) return REG_NOMATCH;       // ran off the end of the string
    r = rx_find(rx, p, start, end, eflags);
  }
  *start += p - s;  // offsets from original string
  *end += p - s;
  return 0;
}
2666 
2667 ////////////////////
2668 ////   fields
2669 ////////////////////
2670 
2671 #define FIELDS_MAX  102400 // Was 1024; need more for toybox awk test
2672 #define THIS_MEANS_SET_NF 999999999
2673 
get_int_val(struct zvalue * v)2674 static int get_int_val(struct zvalue *v)
2675 {
2676   if (IS_NUM(v)) return (int)v->num;
2677   if (IS_STR(v) && v->vst) return (int)atof(v->vst->str);
2678   return 0;
2679 }
2680 
2681 // A single-char FS is never a regex, so make it a [<char>] regex to
2682 // match only that one char in case FS is a regex metachar.
2683 // If regex FS is needed, must use > 1 char. If a '.' regex
2684 // is needed, use e.g. '.|.' (unlikely case).
fmt_one_char_fs(char * fs)2685 static char *fmt_one_char_fs(char *fs)
2686 {
2687   if (strlen(fs) != 1) return fs;
2688   snprintf(TT.one_char_fs, sizeof(TT.one_char_fs), "[%c]", fs[0]);
2689   return TT.one_char_fs;
2690 }
2691 
rx_fs_prep(char * fs)2692 static regex_t *rx_fs_prep(char *fs)
2693 {
2694   if (!strcmp(fs, " ")) return &TT.rx_default;
2695   if (!strcmp(fs, TT.fs_last)) return &TT.rx_last;
2696   if (strlen(fs) >= FS_MAX) FATAL("FS too long");
2697   strcpy(TT.fs_last, fs);
2698   regfree(&TT.rx_last);
2699   xregcomp(&TT.rx_last, fmt_one_char_fs(fs), REG_EXTENDED);
2700   return &TT.rx_last;
2701 }
2702 
2703 // Only for use by split() builtin
set_map_element(struct zmap * m,int k,char * val,size_t len)2704 static void set_map_element(struct zmap *m, int k, char *val, size_t len)
2705 {
2706   // Do not need format here b/c k is integer, uses "%lld" format.
2707   struct zstring *key = num_to_zstring(k, "");// "" vs 0 format avoids warning
2708   struct zmap_slot *zs = zmap_find_or_insert_key(m, key);
2709   zstring_release(&key);
2710   zs->val.vst = zstring_update(zs->val.vst, 0, val, len);
2711   zs->val.flags = ZF_STR;
2712   check_numeric_string(&zs->val);
2713 }
2714 
set_zvalue_str(struct zvalue * v,char * s,size_t size)2715 static void set_zvalue_str(struct zvalue *v, char *s, size_t size)
2716 {
2717   v->vst = zstring_update(v->vst, 0, s, size);
2718   v->flags = ZF_STR;
2719 }
2720 
2721 // All changes to NF go through here!
set_nf(int nf)2722 static void set_nf(int nf)
2723 {
2724   STACK[NF].num = TT.nf_internal = nf;
2725   STACK[NF].flags = ZF_NUM;
2726 }
2727 
set_field(struct zmap * unused,int fnum,char * s,size_t size)2728 static void set_field(struct zmap *unused, int fnum, char *s, size_t size)
2729 { (void)unused;
2730   if (fnum < 0 || fnum > FIELDS_MAX) FFATAL("bad field num %d\n", fnum);
2731   int nfields = zlist_len(&TT.fields);
2732   // Need nfields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2733   while (nfields <= fnum)
2734     nfields = zlist_append(&TT.fields, &uninit_zvalue) + 1;
2735   set_zvalue_str(&FIELD[fnum], s, size);
2736   set_nf(fnum);
2737   check_numeric_string(&FIELD[fnum]);
2738 }
2739 
2740 // Split s via fs, using setter; return number of TT.fields.
2741 // This is used to split TT.fields and also for split() builtin.
splitter(void (* setter)(struct zmap *,int,char *,size_t),struct zmap * m,char * s,struct zvalue * zvfs)2742 static int splitter(void (*setter)(struct zmap *, int, char *, size_t), struct zmap *m, char *s, struct zvalue *zvfs)
2743 {
2744   regex_t *rx;
2745   regoff_t offs, end;
2746   if (!IS_RX(zvfs)) to_str(zvfs);
2747   char *fs = IS_STR(zvfs) ? zvfs->vst->str : "";
2748   int nf = 0, r = 0, eflag = 0;
2749   // Empty string or empty fs (regex).
2750   // Need to include !*s b/c empty string, otherwise
2751   // split("", a, "x") splits to a 1-element (empty element) array
2752   if (!*s || (IS_STR(zvfs) && !*fs) || IS_EMPTY_RX(zvfs)) {
2753     for ( ; *s; s++) setter(m, ++nf, s, 1);
2754     return nf;
2755   }
2756   if (IS_RX(zvfs)) rx = zvfs->rx;
2757   else rx = rx_fs_prep(fs);
2758   while (*s) {
2759     // Find the next occurrence of FS.
2760     // rx_find_FS() returns 0 if found. If nonzero, the field will
2761     // be the rest of the record (all of it if first time through).
2762     if ((r = rx_find_FS(rx, s, &offs, &end, eflag))) offs = end = strlen(s);
2763     else {
2764       int k = strcspn(s, "\n");
2765       if (k < offs) offs = k, end = k + 1;
2766     }
2767     eflag |= REG_NOTBOL;
2768 
2769     // Field will be s up to (not including) the offset. If offset
2770     // is zero and FS is found and FS is ' ' (TT.rx_default "[ \t]+"),
2771     // then the find is the leading or trailing spaces and/or tabs.
2772     // If so, skip this (empty) field, otherwise set field, length is offs.
2773     if (offs || r || rx != &TT.rx_default) setter(m, ++nf, s, offs);
2774     s += end;
2775   }
2776   if (!r && rx != &TT.rx_default) setter(m, ++nf, "", 0);
2777   return nf;
2778 }
2779 
build_fields(void)2780 static void build_fields(void)
2781 {
2782   char *rec = FIELD[0].vst->str;
2783   // TODO test this -- why did I not want to split empty $0?
2784   // Maybe don't split empty $0 b/c non-default FS gets NF==1 with splitter()?
2785   set_nf(*rec ? splitter(set_field, 0, rec, to_str(&STACK[FS])) : 0);
2786 }
2787 
rebuild_field0(void)2788 static void rebuild_field0(void)
2789 {
2790   struct zstring *s = FIELD[0].vst;
2791   int nf = TT.nf_internal;
2792   // uninit value needed for eventual reference to .vst in zstring_release()
2793   struct zvalue tempv = uninit_zvalue;
2794   zvalue_copy(&tempv, to_str(&STACK[OFS]));
2795   for (int i = 1; i <= nf; i++) {
2796     if (i > 1) {
2797       s = s ? zstring_extend(s, tempv.vst) : zstring_copy(s, tempv.vst);
2798     }
2799     if (FIELD[i].flags) to_str(&FIELD[i]);
2800     if (FIELD[i].vst) {
2801       if (i > 1) s = zstring_extend(s, FIELD[i].vst);
2802       else s = zstring_copy(s, FIELD[i].vst);
2803     }
2804   }
2805   FIELD[0].vst = s;
2806   FIELD[0].flags |= ZF_STR;
2807   zvalue_release_zstring(&tempv);
2808 }
2809 
2810 // get field ref (lvalue ref) in prep for assignment to field.
2811 // [... assigning to a nonexistent field (for example, $(NF+2)=5) shall
2812 // increase the value of NF; create any intervening TT.fields with the
2813 // uninitialized value; and cause the value of $0 to be recomputed, with the
2814 // TT.fields being separated by the value of OFS.]
2815 // Called by setup_lvalue()
get_field_ref(int fnum)2816 static struct zvalue *get_field_ref(int fnum)
2817 {
2818   if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d", fnum);
2819   if (fnum > TT.nf_internal) {
2820     // Ensure TT.fields list is large enough for fnum
2821     // Need len of TT.fields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2822     for (int i = TT.nf_internal + 1; i <= fnum; i++) {
2823       if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue);
2824       zvalue_copy(&FIELD[i], &uninit_string_zvalue);
2825     }
2826     set_nf(fnum);
2827   }
2828   return &FIELD[fnum];
2829 }
2830 
2831 // Called by tksplit op
split(struct zstring * s,struct zvalue * a,struct zvalue * fs)2832 static int split(struct zstring *s, struct zvalue *a, struct zvalue *fs)
2833 {
2834   return splitter(set_map_element, a->map, s->str, fs);
2835 }
2836 
2837 // Called by getrec_f0_f() and getrec_f0()
copy_to_field0(char * buf,size_t k)2838 static void copy_to_field0(char *buf, size_t k)
2839 {
2840   set_zvalue_str(&FIELD[0], buf, k);
2841   check_numeric_string(&FIELD[0]);
2842   build_fields();
2843 }
2844 
2845 // After changing $0, must rebuild TT.fields & reset NF
2846 // Changing other field must rebuild $0
2847 // Called by gsub() and assignment ops.
fixup_fields(int fnum)2848 static void fixup_fields(int fnum)
2849 {
2850   if (fnum == THIS_MEANS_SET_NF) {  // NF was assigned to
2851     int new_nf = get_int_val(&STACK[NF]);
2852     // Ensure TT.fields list is large enough for fnum
2853     // Need len of TT.fields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2854     for (int i = TT.nf_internal + 1; i <= new_nf; i++) {
2855       if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue);
2856       zvalue_copy(&FIELD[i], &uninit_string_zvalue);
2857     }
2858     set_nf(TT.nf_internal = STACK[NF].num);
2859     rebuild_field0();
2860     return;
2861   }
2862   // fnum is # of field that was just updated.
2863   // If it's 0, need to rebuild the TT.fields 1... n.
2864   // If it's non-0, need to rebuild field 0.
2865   to_str(&FIELD[fnum]);
2866   if (fnum) check_numeric_string(&FIELD[fnum]);
2867   if (fnum) rebuild_field0();
2868   else build_fields();
2869 }
2870 
2871 // Fetching non-existent field gets uninit string value; no change to NF!
2872 // Called by tkfield op       // TODO inline it?
push_field(int fnum)2873 static void push_field(int fnum)
2874 {
2875   if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d", fnum);
2876   // Contrary to posix, awk evaluates TT.fields beyond $NF as empty strings.
2877   if (fnum > TT.nf_internal) push_val(&uninit_string_zvalue);
2878   else push_val(&FIELD[fnum]);
2879 }
2880 
2881 ////////////////////
2882 ////   END fields
2883 ////////////////////
2884 
// Shorthand used by the stack helpers below (popnumval(), drop(), swap()...).
#define STKP    TT.stackp   // pointer to top of stack
2886 
seedrand(double seed)2887 static double seedrand(double seed)
2888 {
2889   static double prev_seed;
2890   double r = prev_seed;
2891   srandom(trunc(prev_seed = seed));
2892   return r;
2893 }
2894 
popnumval(void)2895 static int popnumval(void)
2896 {
2897   return STKP-- -> num;
2898 }
2899 
drop(void)2900 static void drop(void)
2901 {
2902   if (!(STKP->flags & (ZF_ANYMAP | ZF_RX))) zstring_release(&STKP->vst);
2903   STKP--;
2904 }
2905 
// Discard the top n stack entries.
static void drop_n(int n)
{
  for ( ; n; n--) drop();
}
2910 
swap(void)2911 static void swap(void)
2912 {
2913   struct zvalue tmp = STKP[-1];
2914   STKP[-1] = STKP[0];
2915   STKP[0] = tmp;
2916 }
2917 
2918 // Set and return logical (0/1) val of top TT.stack value; flag value as NUM.
get_set_logical(void)2919 static int get_set_logical(void)
2920 {
2921   struct zvalue *v = STKP;
2922   force_maybemap_to_scalar(v);
2923   int r = 0;
2924   if (IS_NUM(v)) r = !! v->num;
2925   else if (IS_STR(v)) r = (v->vst && v->vst->str[0]);
2926   zvalue_release_zstring(v);
2927   v->num = r;
2928   v->flags = ZF_NUM;
2929   return r;
2930 }
2931 
2932 
to_num(struct zvalue * v)2933 static double to_num(struct zvalue *v)
2934 {
2935   force_maybemap_to_scalar(v);
2936   if (v->flags & ZF_NUMSTR) zvalue_release_zstring(v);
2937   else if (!IS_NUM(v)) {
2938     v->num = 0.0;
2939     if (IS_STR(v) && v->vst) v->num = atof(v->vst->str);
2940     zvalue_release_zstring(v);
2941   }
2942   v->flags = ZF_NUM;
2943   return v->num;
2944 }
2945 
set_num(struct zvalue * v,double n)2946 static void set_num(struct zvalue *v, double n)
2947 {
2948   zstring_release(&v->vst);
2949   v->num = n;
2950   v->flags = ZF_NUM;
2951 }
2952 
incr_zvalue(struct zvalue * v)2953 static void incr_zvalue(struct zvalue *v)
2954 {
2955   v->num = trunc(to_num(v)) + 1;
2956 }
2957 
push_int_val(ptrdiff_t n)2958 static void push_int_val(ptrdiff_t n)
2959 {
2960   struct zvalue v = ZVINIT(ZF_NUM, n, 0);
2961   push_val(&v);
2962 }
2963 
get_map_val(struct zvalue * v,struct zvalue * key)2964 static struct zvalue *get_map_val(struct zvalue *v, struct zvalue *key)
2965 {
2966   struct zmap_slot *x = zmap_find_or_insert_key(v->map, to_str(key)->vst);
2967   return &x->val;
2968 }
2969 
setup_lvalue(int ref_stack_ptr,int parmbase,int * field_num)2970 static struct zvalue *setup_lvalue(int ref_stack_ptr, int parmbase, int *field_num)
2971 {
2972   // ref_stack_ptr is number of slots down in stack the ref is
2973   // for +=, *=, etc
2974   // Stack is: ... scalar_ref value_to_op_by
2975   // or ... subscript_val map_ref value_to_op_by
2976   // or ... fieldref value_to_op_by
2977   // for =, ++, --
2978   // Stack is: ... scalar_ref
2979   // or ... subscript_val map_ref
2980   // or ... fieldnum fieldref
2981   int k;
2982   struct zvalue *ref, *v = 0; // init v to mute "may be uninit" warning
2983   *field_num = -1;
2984   ref = STKP - ref_stack_ptr;
2985   if (ref->flags & ZF_FIELDREF) return get_field_ref(*field_num = ref->num);
2986   k = ref->num >= 0 ? ref->num : parmbase - ref->num;
2987   if (k == NF) *field_num = THIS_MEANS_SET_NF;
2988   v = &STACK[k];
2989   if (ref->flags & ZF_REF) {
2990     force_maybemap_to_scalar(v);
2991   } else if (ref->flags & ZF_MAPREF) {
2992     force_maybemap_to_map(v);
2993     if (!IS_MAP(v)) FATAL("scalar in array context");
2994     v = get_map_val(v, STKP - ref_stack_ptr - 1);
2995     swap();
2996     drop();
2997   } else FATAL("assignment to bad lvalue");
2998   return v; // order FATAL() and return to mute warning
2999 }
3000 
3001 
new_file(char * fn,FILE * fp,char mode,char file_or_pipe)3002 static struct zfile *new_file(char *fn, FILE *fp, char mode, char file_or_pipe)
3003 {
3004   struct zfile *f = xzalloc(sizeof(struct zfile));
3005   *f = (struct zfile){TT.zfiles, xstrdup(fn), fp, mode, file_or_pipe,
3006                         0, 0, 0, 0, 0, 0, 0, 0, 0};
3007   return TT.zfiles = f;
3008 }
3009 
fflush_all(void)3010 static int fflush_all(void)
3011 {
3012   int ret = 0;
3013   for (struct zfile *p = TT.zfiles; p; p = p->next)
3014     if (fflush(p->fp)) ret = -1;
3015   return ret;
3016 }
3017 
fflush_file(int nargs)3018 static int fflush_file(int nargs)
3019 {
3020   if (!nargs) return fflush_all();
3021 
3022   to_str(STKP);   // filename at top of TT.stack
3023   // Null string means flush all
3024   if (!STKP[0].vst->str[0]) return fflush_all();
3025 
3026   // is it open in file table?
3027   for (struct zfile *p = TT.zfiles; p; p = p->next)
3028     if (!strcmp(STKP[0].vst->str, p->fn))
3029       if (!fflush(p->fp)) return 0;
3030   return -1;    // error, or file not found in table
3031 }
close_file(char * fn)3032 static int close_file(char *fn)
3033 {
3034   // !fn (null ptr) means close all (exc. stdin/stdout/stderr)
3035   int r = 0;
3036   struct zfile *np, **pp = &TT.zfiles;
3037   for (struct zfile *p = TT.zfiles; p; p = np) {
3038     np = p->next;   // save in case unlinking file (invalidates p->next)
3039     // Don't close std files -- wrecks print/printf (can be fixed though TODO)
3040     if ((!p->is_std_file) && (!fn || !strcmp(fn, p->fn))) {
3041       xfree(p->recbuf);
3042       xfree(p->recbuf_multi);
3043       xfree(p->recbuf_multx);
3044       xfree(p->fn);
3045       r = (p->fp) ? (p->file_or_pipe ? fclose : pclose)(p->fp) : -1;
3046       *pp = p->next;
3047       xfree(p);
3048       if (fn) return r;
3049     } else pp = &p->next; // only if not unlinking zfile
3050   }
3051   return -1;  // file not in table, or closed all files
3052 }
3053 
// Placeholder entry that setup_file() returns when a file or pipe cannot be
// opened for reading. Being static, its members (including fp) start as zero.
static struct zfile badfile_obj, *badfile = &badfile_obj;
3055 
3056 // FIXME TODO check if file/pipe/mode matches what's in the table already.
3057 // Apparently gawk/mawk/nawk are OK with different mode, but just use the file
3058 // in whatever mode it's already in; i.e. > after >> still appends.
setup_file(char file_or_pipe,char * mode)3059 static struct zfile *setup_file(char file_or_pipe, char *mode)
3060 {
3061   to_str(STKP);   // filename at top of TT.stack
3062   char *fn = STKP[0].vst->str;
3063   // is it already open in file table?
3064   for (struct zfile *p = TT.zfiles; p; p = p->next)
3065     if (!strcmp(fn, p->fn)) {
3066       drop();
3067       return p;   // open; return it
3068     }
3069   FILE *fp = (file_or_pipe ? fopen : popen)(fn, mode);
3070   if (fp) {
3071     struct zfile *p = new_file(fn, fp, *mode, file_or_pipe);
3072     drop();
3073     return p;
3074   }
3075   if (*mode != 'r') FFATAL("cannot open '%s'\n", fn);
3076   drop();
3077   return badfile;
3078 }
3079 
// TODO FIXME should be a function?
// stkn(n): index, counted from the base of TT.stack, of the entry n slots
// below the current stack pointer; stkn(0) is the index of the top entry.
#define stkn(n) ((int)(TT.stackp - (n) - (struct zvalue *)TT.stack.base))
3082 
getcnt(int k)3083 static int getcnt(int k)
3084 {
3085   if (k >= stkn(0)) FATAL("too few args for printf\n");
3086   return (int)to_num(&STACK[k]);
3087 }
3088 
fsprintf(FILE * ignored,const char * fmt,...)3089 static int fsprintf(FILE *ignored, const char *fmt, ...)
3090 {
3091   (void)ignored;
3092   va_list args, args2;
3093   va_start(args, fmt);
3094   va_copy(args2, args);
3095   int len = vsnprintf(0, 0, fmt, args); // size needed
3096   va_end(args);
3097   if (len < 0) FATAL("Bad sprintf format");
3098   // Unfortunately we have to mess with zstring internals here.
3099   if (TT.rgl.zspr->size + len + 1 > TT.rgl.zspr->capacity) {
3100       // This should always work b/c capacity > size
3101       unsigned cap = 2 * TT.rgl.zspr->capacity + len;
3102       TT.rgl.zspr = xrealloc(TT.rgl.zspr, sizeof(*TT.rgl.zspr) + cap);
3103       TT.rgl.zspr->capacity = cap;
3104     }
3105   vsnprintf(TT.rgl.zspr->str + TT.rgl.zspr->size, len+1, fmt, args2);
3106   TT.rgl.zspr->size += len;
3107   TT.rgl.zspr->str[TT.rgl.zspr->size] = 0;
3108   va_end(args2);
3109   return 0;
3110 }
3111 
varprint(int (* fpvar)(FILE *,const char *,...),FILE * outfp,int nargs)3112 static void varprint(int(*fpvar)(FILE *, const char *, ...), FILE *outfp, int nargs)
3113 {
3114   int k, nn, nnc, fmtc, holdc, cnt1 = 0, cnt2 = 0;
3115   char *s = 0;  // to shut up spurious warning
3116   regoff_t offs = -1, e = -1;
3117   char *pfmt, *fmt = to_str(STKP-nargs+1)->vst->str;
3118   k = stkn(nargs - 2);
3119   while (*fmt) {
3120     double n = 0;
3121     nn = strcspn(fmt, "%");
3122     if (nn) {
3123       holdc = fmt[nn];
3124       fmt[nn] = 0;
3125       fpvar(outfp, "%s", fmt);
3126       fmt[nn] = holdc;
3127     }
3128     fmt += nn;
3129     if (!*(pfmt = fmt)) break;
3130     nnc = strcspn(fmt+1, "aAdiouxXfFeEgGcs%");
3131     fmtc = fmt[nnc+1];
3132     if (!fmtc) FFATAL("bad printf format '%s'", fmt);
3133     holdc = fmt[nnc+2];
3134     fmt[nnc+2] = 0;
3135     if (rx_find(&TT.rx_printf_fmt, fmt, &offs, &e, 0))
3136       FFATAL("bad printf format <%s>\n", fmt);
3137     int nargsneeded = 1;
3138     for (char *p = strchr(fmt, '*'); p; p = strchr(p+1, '*'))
3139       nargsneeded++;
3140     nargsneeded -= fmtc == '%';
3141 
3142     switch (nargsneeded) {
3143       case 0:
3144         fpvar(outfp, fmt);
3145         break;
3146       case 3:
3147         cnt1 = getcnt(k++);
3148         ATTR_FALLTHROUGH_INTENDED;
3149       case 2:
3150         cnt2 = getcnt(k++);
3151         ATTR_FALLTHROUGH_INTENDED;
3152       case 1:
3153         if (k > stkn(0)) FATAL("too few args for printf\n");
3154         if (fmtc == 's') {
3155           s = to_str(&STACK[k++])->vst->str;
3156         } else if (fmtc == 'c' && !IS_NUM(&STACK[k])) {
3157           unsigned wch;
3158           struct zvalue *z = &STACK[k++];
3159           if (z->vst && z->vst->str[0])
3160             n = utf8towc(&wch, z->vst->str, z->vst->size) < 1 ? 0xfffd : wch;
3161         } else {
3162           n = to_num(&STACK[k++]);
3163         }
3164         if (strchr("cdiouxX", fmtc)) {
3165           pfmt = strcpy(TT.pbuf, fmt);
3166           if (pfmt[nnc] != 'l') {
3167             strcpy(pfmt+nnc+1, "l_");
3168             pfmt[nnc+2] = fmtc;
3169           }
3170         }
3171         if (fmtc == 'c' && n > 0x10ffff) n = 0xfffd;  // musl won't take larger "wchar"
3172         switch (nargsneeded) {
3173           case 1:
3174             if (fmtc == 's') fpvar(outfp, pfmt, s);
3175             else if (fmtc == 'c') fpvar(outfp, pfmt, (wint_t)n);
3176             else if (strchr("di", fmtc)) fpvar(outfp, pfmt, (long)n);
3177             else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, (unsigned long)n);
3178             else fpvar(outfp, pfmt, n);
3179             break;
3180           case 2:
3181             if (fmtc == 's') fpvar(outfp, pfmt, cnt2, s);
3182             else if (fmtc == 'c') fpvar(outfp, pfmt, cnt2, (wint_t)n);
3183             else if (strchr("di", fmtc)) fpvar(outfp, pfmt, cnt2, (long)n);
3184             else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, cnt2, (unsigned long)n);
3185             else fpvar(outfp, pfmt, cnt2, n);
3186             break;
3187           case 3:
3188             if (fmtc == 's') fpvar(outfp, pfmt, cnt1, cnt2, s);
3189             else if (fmtc == 'c') fpvar(outfp, pfmt, cnt1, cnt2, (wint_t)n);
3190             else if (strchr("di", fmtc)) fpvar(outfp, pfmt, cnt1, cnt2, (long)n);
3191             else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, cnt1, cnt2, (unsigned long)n);
3192             else fpvar(outfp, pfmt, cnt1, cnt2, n);
3193             break;
3194         }
3195         break;
3196       default:
3197         FATAL("bad printf format\n");
3198     }
3199     fmt += nnc + 2;
3200     *fmt = holdc;
3201   }
3202 }
3203 
// Return 1 if v is a valid variable name: a letter or underscore followed
// by any number of letters, digits and underscores. Otherwise return 0.
static int is_ok_varname(char *v)
{
  static const char letters[] =
      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
  static const char digits[] = "0123456789";
  if (!*v || !strchr(letters, *v)) return 0;
  for (char *p = v + 1; *p; p++)
    if (!strchr(letters, *p) && !strchr(digits, *p)) return 0;
  return 1;
}
3212 
3213 // FIXME TODO return value never used. What if assign to var not in globals?
assign_global(char * var,char * value)3214 static int assign_global(char *var, char *value)
3215 {
3216   if (!is_ok_varname(var)) FFATAL("Invalid variable name '%s'\n", var);
3217   int globals_ent = find_global(var);
3218   if (globals_ent) {
3219     struct zvalue *v = &STACK[globals_ent];
3220     if (IS_MAP(v)) error_exit("-v assignment to array");  // Maybe not needed?
3221 
3222 // The compile phase may insert a var in global table with flag of zero.  Then
3223 // init_globals() will assign a ZF_MAYBEMAP flag to it. If it is then assigned
3224 // via -v option or by assignment_arg() it will here be assigned a string value.
3225 // So first, remove all map data to prevent memory leak. BUG FIX // 2024-02-13.
3226     if (v->flags & ZF_ANYMAP) {
3227       zmap_delete_map_incl_slotdata(v->map);
3228       xfree(v->map);
3229       v->map = 0;
3230       v->flags &= ~ZF_ANYMAP;
3231     }
3232 
3233     zvalue_release_zstring(v);
3234     value = xstrdup(value);
3235     *v = new_str_val(escape_str(value, 0));
3236     xfree(value);
3237     check_numeric_string(v);
3238     return 1;
3239   }
3240   return 0;
3241 }
3242 
// If arg has the form name=value with a valid variable name (starts w/
// _A-Za-z then _A-Za-z0-9), assign the global and return 1;
// otherwise return 0.
// arg is split at the '=' only temporarily and is intact on return.
// If not valid assignment form, then nextfilearg needs to treat as filename.
static int assignment_arg(char *arg)
{
  char *eq = strchr(arg, '=');
  if (!eq) return 0;
  *eq = 0;
  int ok = is_ok_varname(arg);
  if (ok) assign_global(arg, eq + 1);
  *eq = '=';
  return ok;
}
3262 
nextfilearg(void)3263 static char *nextfilearg(void)
3264 {
3265   char *arg;
3266   do {
3267     if (++TT.rgl.narg >= (int)to_num(&STACK[ARGC])) return 0;
3268     struct zvalue *v = &STACK[ARGV];
3269     struct zvalue zkey = ZVINIT(ZF_STR, 0,
3270         num_to_zstring(TT.rgl.narg, to_str(&STACK[CONVFMT])->vst->str));
3271     arg = "";
3272     if (zmap_find(v->map, zkey.vst)) {
3273       zvalue_copy(&TT.rgl.cur_arg, to_str(get_map_val(v, &zkey)));
3274       arg = TT.rgl.cur_arg.vst->str;
3275     }
3276     zvalue_release_zstring(&zkey);
3277   } while (!*arg || assignment_arg(arg));
3278   TT.rgl.nfiles++;
3279   return arg;
3280 }
3281 
next_fp(void)3282 static int next_fp(void)
3283 {
3284   char *fn = nextfilearg();
3285   if (TT.cfile->fp && TT.cfile->fp != stdin) fclose(TT.cfile->fp);
3286   if ((!fn && !TT.rgl.nfiles && TT.cfile->fp != stdin) || (fn && !strcmp(fn, "-"))) {
3287     TT.cfile->fp = stdin;
3288     zvalue_release_zstring(&STACK[FILENAME]);
3289     STACK[FILENAME].vst = new_zstring("<stdin>", 7);
3290   } else if (fn) {
3291     if (!(TT.cfile->fp = fopen(fn, "r"))) FFATAL("can't open %s\n", fn);
3292     zvalue_copy(&STACK[FILENAME], &TT.rgl.cur_arg);
3293     set_num(&STACK[FNR], 0);
3294   } else {
3295     TT.rgl.eof = 1;
3296     return 0;
3297   }
3298   return 1;
3299 }
3300 
getrec_multiline(struct zfile * zfp)3301 static ssize_t getrec_multiline(struct zfile *zfp)
3302 {
3303   ssize_t k, kk;
3304   do {
3305     k = getdelim(&zfp->recbuf_multi, &zfp->recbufsize_multi, '\n', zfp->fp);
3306   } while (k > 0 && zfp->recbuf_multi[0] == '\n');
3307   TT.rgl.recptr = zfp->recbuf_multi;
3308   if (k < 0) return k;
3309   // k > 0 and recbuf_multi is not only a \n. Prob. ends w/ \n
3310   // but may not at EOF (last line w/o newline)
3311   for (;;) {
3312     kk = getdelim(&zfp->recbuf_multx, &zfp->recbufsize_multx, '\n', zfp->fp);
3313     if (kk < 0 || zfp->recbuf_multx[0] == '\n') break;
3314     // data is in zfp->recbuf_multi[0..k-1]; append to it
3315     if ((size_t)(k + kk + 1) > zfp->recbufsize_multi)
3316       zfp->recbuf_multi =
3317           xrealloc(zfp->recbuf_multi, zfp->recbufsize_multi = k + kk + 1);
3318     memmove(zfp->recbuf_multi + k, zfp->recbuf_multx, kk+1);
3319     k += kk;
3320   }
3321   if (k > 1 && zfp->recbuf_multi[k-1] == '\n') zfp->recbuf_multi[--k] = 0;
3322   TT.rgl.recptr = zfp->recbuf_multi;
3323   return k;
3324 }
3325 
// Search the len bytes at s for rx using regexec0(). On a match, store the
// match offsets in *start and *end and return 0. Returns REG_NOMATCH when
// there is no match; any other regexec0() result is fatal.
static int rx_findx(regex_t *rx, char *s, long len, regoff_t *start, regoff_t *end, int eflags)
{
  regmatch_t m;
  int rc = regexec0(rx, s, len, 1, &m, eflags);
  if (rc == REG_NOMATCH) return rc;
  if (rc) FATAL("regexec error");  // TODO ? use regerr() to meaningful msg
  *start = m.rm_so;
  *end = m.rm_eo;
  return 0;
}
3336 
getrec_f(struct zfile * zfp)3337 static ssize_t getrec_f(struct zfile *zfp)
3338 {
3339   int r = 0, rs = ENSURE_STR(&STACK[RS])->vst->str[0] & 0xff;
3340   if (!rs) return getrec_multiline(zfp);
3341   regex_t rsrx, *rsrxp = &rsrx;
3342   // TEMP!! FIXME Need to cache and avoid too-frequent rx compiles
3343   rx_zvalue_compile(&rsrxp, &STACK[RS]);
3344   regoff_t so = 0, eo = 0;
3345   long ret = -1;
3346   for ( ;; ) {
3347     if (zfp->recoffs == zfp->endoffs) {
3348 #define INIT_RECBUF_LEN     8192
3349 #define RS_LENGTH_MARGIN    (INIT_RECBUF_LEN / 8)
3350       if (!zfp->recbuf)
3351         zfp->recbuf = xmalloc((zfp->recbufsize = INIT_RECBUF_LEN) + 1);
3352       zfp->endoffs = fread(zfp->recbuf, 1, zfp->recbufsize, zfp->fp);
3353       zfp->recoffs = 0;
3354       zfp->recbuf[zfp->endoffs] = 0;
3355       if (!zfp->endoffs) break;
3356     }
3357     TT.rgl.recptr = zfp->recbuf + zfp->recoffs;
3358     r = rx_findx(rsrxp, TT.rgl.recptr, zfp->endoffs - zfp->recoffs, &so, &eo, 0);
3359     // if not found, or found "near" end of buffer...
3360     if (r || zfp->recoffs + eo > (int)zfp->recbufsize - RS_LENGTH_MARGIN) {
3361       // if at end of data, and (not found or found at end of data)
3362       if (zfp->endoffs < (int)zfp->recbufsize &&
3363           (r || zfp->recoffs + eo == zfp->endoffs)) {
3364         ret = zfp->endoffs - zfp->recoffs;
3365         zfp->recoffs = zfp->endoffs;
3366         break;
3367       }
3368       if (zfp->recoffs) {
3369         memmove(zfp->recbuf, TT.rgl.recptr, zfp->endoffs - zfp->recoffs);
3370         zfp->endoffs -= zfp->recoffs;
3371         zfp->recoffs = 0;
3372       } else zfp->recbuf =
3373         xrealloc(zfp->recbuf, (zfp->recbufsize = zfp->recbufsize * 3 / 2) + 1);
3374       zfp->endoffs += fread(zfp->recbuf + zfp->endoffs,
3375                       1, zfp->recbufsize - zfp->endoffs, zfp->fp);
3376       zfp->recbuf[zfp->endoffs] = 0;
3377     } else {
3378       // found and not too near end of data
3379       ret = so;
3380       TT.rgl.recptr[so] = 0;
3381       zfp->recoffs += eo;
3382       break;
3383     }
3384   }
3385   regfree(rsrxp);
3386   return ret;
3387 }
3388 
getrec(void)3389 static ssize_t getrec(void)
3390 {
3391   ssize_t k;
3392   if (TT.rgl.eof) return -1;
3393   if (!TT.cfile->fp) next_fp();
3394   do {
3395     if ((k = getrec_f(TT.cfile)) >= 0) return k;
3396   } while (next_fp());
3397   return -1;
3398 }
3399 
getrec_f0_f(struct zfile * zfp)3400 static ssize_t getrec_f0_f(struct zfile *zfp)
3401 {
3402   ssize_t k = getrec_f(zfp);
3403   if (k >= 0) {
3404     copy_to_field0(TT.rgl.recptr, k);
3405   }
3406   return k;
3407 }
3408 
getrec_f0(void)3409 static ssize_t getrec_f0(void)
3410 {
3411   ssize_t k = getrec();
3412   if (k >= 0) {
3413     copy_to_field0(TT.rgl.recptr, k);
3414     incr_zvalue(&STACK[NR]);
3415     incr_zvalue(&STACK[FNR]);
3416   }
3417   return k;
3418 }
3419 
3420 // source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe)
3421 // fp is file or pipe (is NULL if file/pipe could not be opened)
3422 // FIXME TODO should -1 return be replaced by test at caller?
3423 // v is NULL or an lvalue ref
awk_getline(int source,struct zfile * zfp,struct zvalue * v)3424 static int awk_getline(int source, struct zfile *zfp, struct zvalue *v)
3425 {
3426   ssize_t k;
3427   int is_stream = source != tkeof;
3428   if (is_stream && !zfp->fp) return -1;
3429   if (v) {
3430     if ((k = is_stream ? getrec_f(zfp) : getrec()) < 0) return 0;
3431     zstring_release(&v->vst);
3432     v->vst = new_zstring(TT.rgl.recptr, k);
3433     v->flags = ZF_STR;
3434     check_numeric_string(v);    // bug fix 20240514
3435     if (!is_stream) {
3436       incr_zvalue(&STACK[NR]);
3437       incr_zvalue(&STACK[FNR]);
3438     }
3439   } else k = is_stream ? getrec_f0_f(zfp) : getrec_f0();
3440   return k < 0 ? 0 : 1;
3441 }
3442 
// Define GAWK_SUB to get the same behavior with sub()/gsub() replacement text
// as with gawk, goawk, and recent bwk awk (nawk) versions. Undefine GAWK_SUB
// to get the simpler POSIX behavior, but I think most users will prefer the
// gawk behavior. See the gawk (GNU Awk) manual,
// sec. 9.1.4.1 // More about '\' and '&' with sub(), gsub(), and gensub()
// for details on the differences.
//
// As written here GAWK_SUB ends up defined; remove the #define line below
// to select the POSIX behavior.
#undef GAWK_SUB
#define GAWK_SUB
3452 
3453 // sub(ere, repl[, in]) Substitute the string repl in place of the
3454 // first instance of the extended regular expression ERE in string 'in'
3455 // and return the number of substitutions.  An <ampersand> ( '&' )
3456 // appearing in the string repl shall be replaced by the string from in
3457 // that matches the ERE. (partial spec... there's more)
gsub(int opcode,int nargs,int parmbase)3458 static void gsub(int opcode, int nargs, int parmbase)
3459 { (void)nargs;
3460   int field_num = -1;
3461   // compile ensures 3 args
3462   struct zvalue *v = setup_lvalue(0, parmbase, &field_num);
3463   struct zvalue *ere = STKP-2;
3464   struct zvalue *repl = STKP-1;
3465   regex_t rx, *rxp = &rx;
3466   rx_zvalue_compile(&rxp, ere);
3467   to_str(repl);
3468   to_str(v);
3469 
3470 #define SLEN(zvalp) ((zvalp)->vst->size)
3471   char *p, *rp0 = repl->vst->str, *rp = rp0, *s = v->vst->str;
3472   int namps = 0, nhits = 0, is_sub = (opcode == tksub), eflags = 0;
3473   regoff_t so = -1, eo;
3474   // Count ampersands in repl string; may be overcount due to \& escapes.
3475   for (rp = rp0; *rp; rp++) namps += *rp == '&';
3476   p = s;
3477   regoff_t need = SLEN(v) + 1;  // capacity needed for result string
3478   // A pass just to determine needed destination (result) string size.
3479   while(!rx_find(rxp, p, &so, &eo, eflags)) {
3480     need += SLEN(repl) + (eo - so) * (namps - 1);
3481     if (!*p) break;
3482     p += eo ? eo : 1; // ensure progress if empty hit at start
3483     if (is_sub) break;
3484     eflags |= REG_NOTBOL;
3485   }
3486 
3487   if (so >= 0) {  // at least one hit
3488     struct zstring *z = xzalloc(sizeof(*z) + need);
3489     z->capacity = need;
3490 
3491     char *e = z->str; // result destination pointer
3492     p = s;
3493     eflags = 0;
3494     char *ep0 = p, *sp, *ep;
3495     while(!rx_find(rxp, p, &so, &eo, eflags)) {
3496       sp = p + so;
3497       ep = p + eo;
3498       memmove(e, ep0, sp - ep0);  // copy unchanged part
3499       e += sp - ep0;
3500       // Skip match if not at start and just after prev match and this is empty
3501       if (p == s || sp - ep0 || eo - so) {
3502         nhits++;
3503         for (rp = rp0; *rp; rp++) { // copy replacement
3504           if (*rp == '&') {
3505             memmove(e, sp, eo - so);  //copy match
3506             e += eo - so;
3507           } else if (*rp == '\\') {
3508             if (rp[1] == '&') *e++ = *++rp;
3509             else if (rp[1] != '\\') *e++ = *rp;
3510             else {
3511 #ifdef GAWK_SUB
3512               if (rp[2] == '\\' && rp[3] == '&') {
3513                 rp += 2;
3514                 *e++ = *rp;
3515               } else if (rp[2] != '&') *e++ = '\\';
3516 #endif
3517               *e++ = *++rp;
3518             }
3519           } else *e++ = *rp;
3520         }
3521       }
3522       ep0 = ep;
3523       if (!*p) break;
3524       p += eo ? eo : 1; // ensure progress if empty hit at start
3525       if (is_sub) break;
3526       eflags |= REG_NOTBOL;
3527     }
3528     // copy remaining subject string
3529     memmove(e, ep0, s + SLEN(v) - ep0);
3530     e += s + SLEN(v) - ep0;
3531     *e = 0;
3532     z->size = e - z->str;
3533     zstring_release(&v->vst);
3534     v->vst = z;
3535   }
3536   rx_zvalue_free(rxp, ere);
3537   if (!IS_RX(STKP-2)) zstring_release(&STKP[-2].vst);
3538   drop_n(3);
3539   push_int_val(nhits);
3540   if (field_num >= 0) fixup_fields(field_num);
3541 }
3542 
millinow(void)3543 static long millinow(void)
3544 {
3545   struct timespec ts;
3546   clock_gettime(CLOCK_REALTIME, &ts);
3547   return ts.tv_sec*1000+ts.tv_nsec/1000000;
3548 }
3549 
3550 // Initially set stackp_needmore at MIN_STACK_LEFT before limit.
3551 // When stackp > stackp_needmore, then expand and reset stackp_needmore
add_stack(struct zvalue ** stackp_needmore)3552 static void add_stack(struct zvalue **stackp_needmore)
3553 {
3554   int k = stkn(0);  // stack elements in use
3555   zlist_expand(&TT.stack);
3556   STKP = (struct zvalue *)TT.stack.base + k;
3557   *stackp_needmore = (struct zvalue *)TT.stack.limit - MIN_STACK_LEFT;
3558 }
3559 
// Limit x to the range lo..hi: yields lo if x < lo, else hi if x > hi,
// else x. Note: arguments may be evaluated more than once.
#define CLAMP(x, lo, hi) (((x) < (lo)) ? (lo) : (((x) > (hi)) ? (hi) : (x)))
3561 
3562 // Main loop of interpreter. Run this once for all BEGIN rules (which
3563 // have had their instructions chained in compile), all END rules (also
3564 // chained in compile), and once for each record of the data file(s).
interpx(int start,int * status)3565 static int interpx(int start, int *status)
3566 {
3567   int *ip = &ZCODE[start];
3568   int opcode, op2, k, r, nargs, nsubscrs, range_num, parmbase = 0;
3569   int field_num;
3570   double nleft, nright, d;
3571   double (*mathfunc[])(double) = {cos, sin, exp, log, sqrt, trunc};
3572   struct zvalue *v, vv,
3573         *stackp_needmore = (struct zvalue*)TT.stack.limit - MIN_STACK_LEFT;
3574   while ((opcode = *ip++)) {
3575 
3576     switch (opcode) {
3577       case opquit:
3578         return opquit;
3579 
3580       case tknot:
3581         (STKP)->num = ! get_set_logical();
3582         break;
3583 
3584       case opnotnot:
3585         get_set_logical();
3586         break;
3587 
3588       case opnegate:
3589         STKP->num = -to_num(STKP);
3590         break;
3591 
3592       case tkpow:         // FALLTHROUGH intentional here
3593       case tkmul:         // FALLTHROUGH intentional here
3594       case tkdiv:         // FALLTHROUGH intentional here
3595       case tkmod:         // FALLTHROUGH intentional here
3596       case tkplus:        // FALLTHROUGH intentional here
3597       case tkminus:
3598         nleft = to_num(STKP-1);
3599         nright = to_num(STKP);
3600         switch (opcode) {
3601           case tkpow: nleft = pow(nleft, nright); break;
3602           case tkmul: nleft *= nright; break;
3603           case tkdiv: nleft /= nright; break;
3604           case tkmod: nleft = fmod(nleft, nright); break;
3605           case tkplus: nleft += nright; break;
3606           case tkminus: nleft -= nright; break;
3607         }
3608         drop();
3609         STKP->num = nleft;
3610         break;
3611 
3612       // FIXME REDO REDO ?
3613       case tkcat:
3614         to_str(STKP-1);
3615         to_str(STKP);
3616         STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP[0].vst);
3617         drop();
3618         break;
3619 
3620         // Comparisons (with the '<', "<=", "!=", "==", '>', and ">="
3621         // operators) shall be made numerically if both operands are numeric,
3622         // if one is numeric and the other has a string value that is a numeric
3623         // string, or if one is numeric and the other has the uninitialized
3624         // value. Otherwise, operands shall be converted to strings as required
3625         // and a string comparison shall be made as follows:
3626         //
3627         // For the "!=" and "==" operators, the strings should be compared to
3628         // check if they are identical but may be compared using the
3629         // locale-specific collation sequence to check if they collate equally.
3630         //
3631         // For the other operators, the strings shall be compared using the
3632         // locale-specific collation sequence.
3633         //
3634         // The value of the comparison expression shall be 1 if the relation is
3635         // true, or 0 if the relation is false.
3636       case tklt:          // FALLTHROUGH intentional here
3637       case tkle:          // FALLTHROUGH intentional here
3638       case tkne:          // FALLTHROUGH intentional here
3639       case tkeq:          // FALLTHROUGH intentional here
3640       case tkgt:          // FALLTHROUGH intentional here
3641       case tkge:
3642         ; int cmp = 31416;
3643 
3644         if (  (IS_NUM(&STKP[-1]) &&
3645               (STKP[0].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[0].flags)) ||
3646               (IS_NUM(&STKP[0]) &&
3647               (STKP[-1].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[-1].flags))) {
3648           switch (opcode) {
3649             case tklt: cmp = STKP[-1].num < STKP[0].num; break;
3650             case tkle: cmp = STKP[-1].num <= STKP[0].num; break;
3651             case tkne: cmp = STKP[-1].num != STKP[0].num; break;
3652             case tkeq: cmp = STKP[-1].num == STKP[0].num; break;
3653             case tkgt: cmp = STKP[-1].num > STKP[0].num; break;
3654             case tkge: cmp = STKP[-1].num >= STKP[0].num; break;
3655           }
3656         } else {
3657           cmp = strcmp(to_str(STKP-1)->vst->str, to_str(STKP)->vst->str);
3658           switch (opcode) {
3659             case tklt: cmp = cmp < 0; break;
3660             case tkle: cmp = cmp <= 0; break;
3661             case tkne: cmp = cmp != 0; break;
3662             case tkeq: cmp = cmp == 0; break;
3663             case tkgt: cmp = cmp > 0; break;
3664             case tkge: cmp = cmp >= 0; break;
3665           }
3666         }
3667         drop();
3668         drop();
3669         push_int_val(cmp);
3670         break;
3671 
3672       case opmatchrec:
3673         op2 = *ip++;
3674         int mret = match(&FIELD[0], &LITERAL[op2]);
3675         push_int_val(!mret);
3676         break;
3677 
3678       case tkmatchop:
3679       case tknotmatch:
3680         mret = match(STKP-1, STKP); // mret == 0 if match
3681         drop();
3682         drop();
3683         push_int_val(!mret == (opcode == tkmatchop));
3684         break;
3685 
3686       case tkpowasgn:     // FALLTHROUGH intentional here
3687       case tkmodasgn:     // FALLTHROUGH intentional here
3688       case tkmulasgn:     // FALLTHROUGH intentional here
3689       case tkdivasgn:     // FALLTHROUGH intentional here
3690       case tkaddasgn:     // FALLTHROUGH intentional here
3691       case tksubasgn:
3692         // Stack is: ... scalar_ref value_to_op_by
3693         // or ... subscript_val map_ref value_to_op_by
3694         // or ... fieldref value_to_op_by
3695         v = setup_lvalue(1, parmbase, &field_num);
3696         to_num(v);
3697         to_num(STKP);
3698         switch (opcode) {
3699           case tkpowasgn:
3700             // TODO
3701             v->num = pow(v->num, STKP->num);
3702             break;
3703           case tkmodasgn:
3704             // TODO
3705             v->num = fmod(v->num, STKP->num);
3706             break;
3707           case tkmulasgn:
3708             v->num *= STKP->num;
3709             break;
3710           case tkdivasgn:
3711             v->num /= STKP->num;
3712             break;
3713           case tkaddasgn:
3714             v->num += STKP->num;
3715             break;
3716           case tksubasgn:
3717             v->num -= STKP->num;
3718             break;
3719         }
3720 
3721         drop_n(2);
3722         v->flags = ZF_NUM;
3723         push_val(v);
3724         if (field_num >= 0) fixup_fields(field_num);
3725         break;
3726 
3727       case tkasgn:
3728         // Stack is: ... scalar_ref value_to_assign
3729         // or ... subscript_val map_ref value_to_assign
3730         // or ... fieldref value_to_assign
3731         v = setup_lvalue(1, parmbase, &field_num);
3732         force_maybemap_to_scalar(STKP);
3733         zvalue_copy(v, STKP);
3734         swap();
3735         drop();
3736         if (field_num >= 0) fixup_fields(field_num);
3737         break;
3738 
3739       case tkincr:        // FALLTHROUGH intentional here
3740       case tkdecr:        // FALLTHROUGH intentional here
3741       case oppreincr:     // FALLTHROUGH intentional here
3742       case oppredecr:
3743         // Stack is: ... scalar_ref
3744         // or ... subscript_val map_ref
3745         // or ... fieldnum fieldref
3746         v = setup_lvalue(0, parmbase, &field_num);
3747         to_num(v);
3748         switch (opcode) {
3749           case tkincr: case tkdecr:
3750             // Must be done in this order because push_val(v) may move v,
3751             // invalidating the pointer.
3752             v->num += (opcode == tkincr) ? 1 : -1;
3753             push_val(v);
3754             // Now reverse the incr/decr on the top TT.stack val.
3755             STKP->num -= (opcode == tkincr) ? 1 : -1;
3756             break;
3757           case oppreincr: case oppredecr:
3758             v->num += (opcode == oppreincr) ? 1 : -1;
3759             push_val(v);
3760             break;
3761         }
3762         swap();
3763         drop();
3764         if (field_num >= 0) fixup_fields(field_num);
3765         break;
3766 
3767       case tknumber:      // FALLTHROUGH intentional here
3768       case tkstring:      // FALLTHROUGH intentional here
3769       case tkregex:
3770         push_val(&LITERAL[*ip++]);
3771         break;
3772 
3773       case tkprint:
3774       case tkprintf:
3775         nargs = *ip++;
3776         int outmode = *ip++;
3777         struct zfile *outfp = TT.zstdout;
3778         switch (outmode) {
3779           case tkgt: outfp = setup_file(1, "w"); break;     // file
3780           case tkappend: outfp = setup_file(1, "a"); break; // file
3781           case tkpipe: outfp = setup_file(0, "w"); break;   // pipe
3782           default: nargs++; break;
3783         }
3784         nargs--;
3785         if (opcode == tkprintf) {
3786           varprint(fprintf, outfp->fp, nargs);
3787           drop_n(nargs);
3788           break;
3789         }
3790         if (!nargs) {
3791           fprintf(outfp->fp, "%s", to_str(&FIELD[0])->vst->str);
3792         } else {
3793           struct zvalue tempv = uninit_zvalue;
3794           zvalue_copy(&tempv, &STACK[OFS]);
3795           to_str(&tempv);
3796           for (int k = 0; k < nargs; k++) {
3797             if (k) fprintf(outfp->fp, "%s", tempv.vst->str);
3798             int sp = stkn(nargs - 1 - k);
3799             ////// FIXME refcnt -- prob. don't need to copy from TT.stack?
3800             v = &STACK[sp];
3801             to_str_fmt(v, OFMT);
3802             struct zstring *zs = v->vst;
3803             fprintf(outfp->fp, "%s", zs ? zs->str : "");
3804           }
3805           zvalue_release_zstring(&tempv);
3806           drop_n(nargs);
3807         }
3808         fputs(ENSURE_STR(&STACK[ORS])->vst->str, outfp->fp);
3809         break;
3810 
3811       case opdrop:
3812         drop();
3813         break;
3814 
3815       case opdrop_n:
3816         drop_n(*ip++);
3817         break;
3818 
3819         // Stack frame layout relative to parmbase:
3820 #define RETURN_VALUE    -4
3821 #define RETURN_ADDR     -3
3822 #define PREV_PARMBASE   -2
3823 #define ARG_CNT         -1
3824 #define FUNCTION_NUM    0
3825         // Actual args follow, starting at parmbase + 1
3826       case tkfunction:    // function definition
3827         op2 = *ip++;    // func table num
3828         struct functab_slot *pfdef = &FUNC_DEF[op2];
3829         struct zlist *loctab = &pfdef->function_locals;
3830         int nparms = zlist_len(loctab)-1;
3831 
3832         nargs = popnumval();
3833         int newparmbase = stkn(nargs);
3834         STACK[newparmbase + PREV_PARMBASE].num = parmbase;
3835         parmbase = newparmbase;
3836         for ( ;nargs > nparms; nargs--)
3837           drop();
3838         for ( ;nargs < nparms; nargs++) {
3839           // Push additional "args" that were not passed by the caller, to
3840           // match the formal parameters (parms) defined in the function
3841           // definition. In the local var table we may have the type as scalar
3842           // or map if it is used as such within the function. In that case we
3843           // init the pushed arg from the type of the locals table.
3844           // But if a var appears only as a bare arg in a function call it will
3845           // not be typed in the locals table. In that case we can only say it
3846           // "may be" a map, but we have to assume the possibility and attach a
3847           // map to the var. When/if the var is used as a map or scalar in the
3848           // called function it will be converted to a map or scalar as
3849           // required.
3850           // See force_maybemap_to_scalar().
3851           struct symtab_slot *q = &((struct symtab_slot *)loctab->base)[nargs+1];
3852           vv = (struct zvalue)ZVINIT(q->flags, 0, 0);
3853           if (vv.flags == 0) {
3854             zvalue_map_init(&vv);
3855             vv.flags = ZF_MAYBEMAP;
3856           } else if (IS_MAP(&vv)) {
3857             zvalue_map_init(&vv);
3858           } else {
3859             vv.flags = 0;
3860           }
3861           push_val(&vv);
3862         }
3863         break;
3864 
3865       case tkreturn:
3866         nparms = *ip++;
3867         nargs = STACK[parmbase+ARG_CNT].num;
3868         force_maybemap_to_scalar(STKP); // Unneeded?
3869         zvalue_copy(&STACK[parmbase+RETURN_VALUE], STKP);
3870         drop();
3871         // Remove the local args (not supplied by caller) from TT.stack, check to
3872         // release any map data created.
3873         while (stkn(0) > parmbase + nargs) {
3874           if ((STKP)->flags & ZF_ANYMAP) {
3875             zmap_delete_map_incl_slotdata((STKP)->map);
3876             xfree((STKP)->map);
3877           }
3878           drop();
3879         }
3880         while (stkn(0) > parmbase + RETURN_VALUE)
3881           drop();
3882         ip = &ZCODE[(int)STACK[parmbase+RETURN_ADDR].num];
3883         parmbase = STACK[parmbase+PREV_PARMBASE].num;
3884         break;
3885 
3886       case opprepcall:    // function call prep
3887         if (STKP > stackp_needmore) add_stack(&stackp_needmore);
3888         push_int_val(0);      // return value placeholder
3889         push_int_val(0);      // return addr
3890         push_int_val(0);      // parmbase
3891         push_int_val(0);      // arg count
3892         push_int_val(*ip++);  // function tbl ref
3893         break;
3894 
3895       case tkfunc:        // function call
3896         nargs = *ip++;
3897         newparmbase = stkn(nargs);
3898         STACK[newparmbase+RETURN_ADDR].num = ip - &ZCODE[0];
3899         STACK[newparmbase+ARG_CNT].num = nargs;
3900         push_int_val(nargs);      // FIXME TODO pass this in a zregister?
3901         ip = &ZCODE[FUNC_DEF[(int)STACK[newparmbase+FUNCTION_NUM].num].zcode_addr];
3902         break;
3903 
3904       case tkrbracket:    // concat multiple map subscripts
3905         nsubscrs = *ip++;
3906         while (--nsubscrs) {
3907           swap();
3908           to_str(STKP);
3909           push_val(&STACK[SUBSEP]);
3910           to_str(STKP);
3911           STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst);
3912           drop();
3913           swap();
3914           to_str(STKP);
3915           STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst);
3916           drop();
3917         }
3918         break;
3919 
3920       case opmapdelete:
3921       case tkdelete:
3922         k = STKP->num;
3923         if (k < 0) k = parmbase - k;    // loc of var on TT.stack
3924         v = &STACK[k];
3925         force_maybemap_to_map(v);
3926         if (opcode == opmapdelete) {
3927           zmap_delete_map(v->map);
3928         } else {
3929           drop();
3930           zmap_delete(v->map, to_str(STKP)->vst);
3931         }
3932         drop();
3933         break;
3934 
3935       case opmap:
3936         op2 = *ip++;
3937         k = op2 < 0 ? parmbase - op2 : op2;
3938         v = &STACK[k];
3939         force_maybemap_to_map(v);
3940         if (!IS_MAP(v)) FATAL("scalar in array context");
3941         v = get_map_val(v, STKP);
3942         drop();     // drop subscript
3943         push_val(v);
3944         break;
3945 
3946       case tkin:
3947         if (!(STKP->flags & ZF_ANYMAP)) FATAL("scalar in array context");
3948         v = zmap_find(STKP->map, to_str(STKP-1)->vst);
3949         drop();
3950         drop();
3951         push_int_val(v ? 1 : 0);
3952         break;
3953 
3954       case opmapiternext:
3955         op2 = *ip++;
3956         v = STKP-1;
3957         force_maybemap_to_map(v);
3958         if (!IS_MAP(v)) FATAL("scalar in array context");
3959         struct zmap *m = v->map;   // Need for MAPSLOT macro
3960         int zlen = zlist_len(&m->slot);
3961         int kk = STKP->num + 1;
3962         while (kk < zlen && !(MAPSLOT[kk].key)) // skip deleted slots
3963           kk++;
3964         STKP->num = kk; // save index for next iteration
3965         if (kk < zlen) {
3966           struct zvalue *var = setup_lvalue(2, parmbase, &field_num);
3967           var->flags = ZF_STR;
3968           zstring_release(&var->vst);
3969           var->vst = MAPSLOT[kk].key;
3970           zstring_incr_refcnt(var->vst);
3971           ip += op2;
3972         }
3973         break;
3974 
3975       case tkvar:
3976         op2 = *ip++;
3977         k = op2 < 0 ? parmbase - op2 : op2;
3978         v = &STACK[k];
3979         push_val(v);
3980         break;
3981 
3982       case tkfield:
3983         // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void)
3984         // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]).
3985         ip++; // skip dummy "operand" instruction field
3986         push_field((int)(to_num(STKP)));
3987 
3988         swap();
3989         drop();
3990         break;
3991 
3992       case oppush:
3993         push_int_val(*ip++);
3994         break;
3995 
3996       case tkand:
3997         op2 = *ip++;
3998         if (get_set_logical()) drop();
3999         else ip += op2;
4000         break;
4001 
4002       case tkor:
4003         op2 = *ip++;
4004         if (!get_set_logical()) drop();
4005         else ip += op2;
4006         break;
4007 
4008       case tkwhile:
4009         (STKP)->num = ! get_set_logical();
4010         ATTR_FALLTHROUGH_INTENDED;
4011         // FALLTHROUGH to tkternif
4012       case tkif:
4013         // FALLTHROUGH to tkternif
4014       case tkternif:
4015         op2 = *ip++;
4016         int t = get_set_logical();  // FIXME only need to get, not set
4017         drop();
4018         if (!t) ip += op2;
4019         break;
4020 
4021       case tkelse:        // FALLTHROUGH intentional here
4022       case tkternelse:    // FALLTHROUGH intentional here
4023       case tkbreak:       // FALLTHROUGH intentional here
4024       case tkcontinue:    // FALLTHROUGH intentional here
4025       case opjump:
4026         op2 = *ip++;
4027         ip += op2;
4028         break;
4029 
4030       case opvarref:
4031         op2 = *ip++;
4032         vv = (struct zvalue)ZVINIT(ZF_REF, op2, 0);
4033         push_val(&vv);
4034         break;
4035 
4036       case opmapref:
4037         op2 = *ip++;
4038         vv = (struct zvalue)ZVINIT(ZF_MAPREF, op2, 0);
4039         push_val(&vv);
4040         break;
4041 
4042       case opfldref:
4043         to_num(STKP);
4044         (STKP)->flags |= ZF_FIELDREF;
4045         ip++; // skip dummy "operand" instruction field
4046         break;
4047 
4048       case opprintrec:
4049         puts(to_str(&FIELD[0])->vst->str);
4050         break;
4051 
4052       case oprange1:
4053         range_num = *ip++;
4054         op2 = *ip++;
4055         if (TT.range_sw[range_num]) ip += op2;
4056         break;
4057 
4058       case oprange2:
4059         range_num = *ip++;
4060         op2 = *ip++;
4061         t = get_set_logical();  // FIXME only need to get, not set
4062         drop();
4063         if (t) TT.range_sw[range_num] = 1;
4064         else ip += op2;
4065         break;
4066 
4067       case oprange3:
4068         range_num = *ip++;
4069         t = get_set_logical();  // FIXME only need to get, not set
4070         drop();
4071         if (t) TT.range_sw[range_num] = 0;
4072         break;
4073 
4074       case tkexit:
4075         r = popnumval();
4076         if (r != NO_EXIT_STATUS) *status = (int)r & 255;
4077         // TODO FIXME do we need NO_EXIT_STATUS at all? Just use 0?
4078         ATTR_FALLTHROUGH_INTENDED;
4079       case tknext:
4080       case tknextfile:
4081         return opcode;
4082 
4083       case tkgetline:
4084         nargs = *ip++;
4085         int source = *ip++;
4086         // TT.stack is:
4087         // if tkgetline 0 tkeof:   (nothing stacked; plain getline)
4088         // if tkgetline 1 tkeof:   (lvalue)
4089         // if tkgetline 1 tklt:    (filename_string)
4090         // if tkgetline 2 tklt:    (lvalue) (filename_string)
4091         // if tkgetline 1 tkpipe:  (pipe_command_string)
4092         // if tkgetline 2 tkpipe:  (pipe_command_string) (lvalue)
4093         // effect is to set:
4094         // if tkgetline 0 tkeof:   $0 NF NR FNR
4095         // if tkgetline 1 tkeof:   var NR FNR
4096         // if tkgetline 1 tklt:    $0 NF
4097         // if tkgetline 2 tklt:    var
4098         // if tkgetline 1 tkpipe:  $0 NF
4099         // if tkgetline 2 tkpipe:  var
4100         // Ensure pipe cmd on top
4101         if (nargs == 2 && source == tkpipe) swap();
4102         struct zfile *zfp = 0;
4103         if (source == tklt || source == tkpipe) {
4104           zfp = setup_file(source == tklt, "r");
4105           nargs--;
4106         }
4107         // now cases are:
4108         // nargs source  TT.stack
4109         //  0 tkeof:   (nothing; plain getline) from current data file
4110         //  1 tkeof:   (lvalue)  from current data file
4111         //  0 tklt:    (nothing) from named file in 'stream'
4112         //  1 tklt:    (lvalue)  from  named file in 'stream'
4113         //  0 tkpipe:  (nothing) from piped command in 'stream'
4114         //  1 tkpipe:  (lvalue)  from piped command in 'stream'
4115         v = nargs ? setup_lvalue(0, parmbase, &field_num) : 0;
4116         if (v) drop();
4117         // source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe)
4118         // stream is name of file or pipe
4119         // v is NULL or an lvalue ref
4120         if (zfp != badfile) push_int_val(awk_getline(source, zfp, v));
4121         else push_int_val(-1);
4122 
4123         // fake return value for now
4124         break;
4125 
4126         ////// builtin functions ///////
4127 
4128       case tksplit:
4129         nargs = *ip++;
4130         if (nargs == 2) push_val(&STACK[FS]);
4131         struct zstring *s = to_str(STKP-2)->vst;
4132         force_maybemap_to_map(STKP-1);
4133         struct zvalue *a = STKP-1;
4134         struct zvalue *fs = STKP;
4135         zmap_delete_map(a->map);
4136         k = split(s, a, fs);
4137         drop_n(3);
4138         push_int_val(k);
4139         break;
4140 
4141       case tkmatch:
4142         nargs = *ip++;
4143         if (!IS_RX(STKP)) to_str(STKP);
4144         regex_t rx_pat, *rxp = &rx_pat;
4145         rx_zvalue_compile(&rxp, STKP);
4146         regoff_t rso = 0, reo = 0;  // shut up warning (may be uninit)
4147         k = rx_find(rxp, to_str(STKP-1)->vst->str, &rso, &reo, 0);
4148         rx_zvalue_free(rxp, STKP);
4149         // Force these to num before setting.
4150         to_num(&STACK[RSTART]);
4151         to_num(&STACK[RLENGTH]);
4152         if (k) STACK[RSTART].num = 0, STACK[RLENGTH].num = -1;
4153         else {
4154           reo = utf8cnt(STKP[-1].vst->str, reo);
4155           rso = utf8cnt(STKP[-1].vst->str, rso);
4156           STACK[RSTART].num = rso + 1, STACK[RLENGTH].num = reo - rso;
4157         }
4158         drop();
4159         drop();
4160         push_int_val(k ? 0 : rso + 1);
4161         break;
4162 
4163       case tksub:
4164       case tkgsub:
4165         gsub(opcode, *ip++, parmbase);  // tksub/tkgsub, args
4166         break;
4167 
4168       case tksubstr:
4169         nargs = *ip++;
4170         struct zstring *zz = to_str(STKP - nargs + 1)->vst;
4171         int nchars = utf8cnt(zz->str, zz->size);  // number of utf8 codepoints
4172         // Offset of start of string (in chars not bytes); convert 1-based to 0-based
4173         ssize_t mm = CLAMP(trunc(to_num(STKP - nargs + 2)) - 1, 0, nchars);
4174         ssize_t nn = nchars - mm;   // max possible substring length (chars)
4175         if (nargs == 3) nn = CLAMP(trunc(to_num(STKP)), 0, nn);
4176         mm = bytesinutf8(zz->str, zz->size, mm);
4177         nn = bytesinutf8(zz->str + mm, zz->size - mm, nn);
4178         struct zstring *zzz = new_zstring(zz->str + mm, nn);
4179         zstring_release(&(STKP - nargs + 1)->vst);
4180         (STKP - nargs + 1)->vst = zzz;
4181         drop_n(nargs - 1);
4182         break;
4183 
4184       case tkindex:
4185         nargs = *ip++;
4186         char *s1 = to_str(STKP-1)->vst->str;
4187         char *s3 = strstr(s1, to_str(STKP)->vst->str);
4188         ptrdiff_t offs = s3 ? utf8cnt(s1, s3 - s1) + 1 : 0;
4189         drop();
4190         drop();
4191         push_int_val(offs);
4192         break;
4193 
4194       case tkband:
4195       case tkbor:
4196       case tkbxor:
4197       case tklshift:
4198       case tkrshift:
4199         ; size_t acc = to_num(STKP);
4200         nargs = *ip++;
4201         for (int i = 1; i < nargs; i++) switch (opcode) {
4202           case tkband: acc &= (size_t)to_num(STKP-i); break;
4203           case tkbor:  acc |= (size_t)to_num(STKP-i); break;
4204           case tkbxor: acc ^= (size_t)to_num(STKP-i); break;
4205           case tklshift: acc = (size_t)to_num(STKP-i) << acc; break;
4206           case tkrshift: acc = (size_t)to_num(STKP-i) >> acc; break;
4207         }
4208         drop_n(nargs);
4209         push_int_val(acc);
4210         break;
4211 
4212       case tktolower:
4213       case tktoupper:
4214         nargs = *ip++;
4215         struct zstring *z = to_str(STKP)->vst;
4216         unsigned zzlen = z->size + 4; // Allow for expansion
4217         zz = zstring_update(0, zzlen, "", 0);
4218         char *p = z->str, *e = z->str + z->size, *q = zz->str;
4219         // Similar logic to toybox strlower(), but fixed.
4220         while (p < e) {
4221           unsigned wch;
4222           int len = utf8towc(&wch, p, e-p);
4223           if (len < 1) {  // nul byte, error, or truncated code
4224             *q++ = *p++;
4225             continue;
4226           }
4227           p += len;
4228           wch = (opcode == tktolower ? towlower : towupper)(wch);
4229           len = wctoutf8(q, wch);
4230           q += len;
4231           // Need realloc here if overflow possible
4232           if ((len = q - zz->str) + 4 < (int)zzlen) continue;
4233           zz = zstring_update(zz, zzlen = len + 16, "", 0);
4234           q = zz->str + len;
4235         }
4236         *q = 0;
4237         zz->size = q - zz->str;
4238         zstring_release(&z);
4239         STKP->vst = zz;
4240         break;
4241 
4242       case tklength:
4243         nargs = *ip++;
4244         v = nargs ? STKP : &FIELD[0];
4245         force_maybemap_to_map(v);
4246         if (IS_MAP(v)) k = v->map->count - v->map->deleted;
4247         else {
4248           to_str(v);
4249           k = utf8cnt(v->vst->str, v->vst->size);
4250         }
4251         if (nargs) drop();
4252         push_int_val(k);
4253         break;
4254 
4255       case tksystem:
4256         nargs = *ip++;
4257         fflush(stdout);
4258         fflush(stderr);
4259         r = system(to_str(STKP)->vst->str);
4260 #ifdef WEXITSTATUS
4261         // WEXITSTATUS is in sys/wait.h, but I'm not including that.
4262         // It seems to also be in stdlib.h in gcc and musl-gcc.
4263         // No idea how portable this is!
4264         if (WIFEXITED(r)) r = WEXITSTATUS(r);
4265 #endif
4266         drop();
4267         push_int_val(r);
4268         break;
4269 
4270       case tkfflush:
4271         nargs = *ip++;
4272         r = fflush_file(nargs);
4273         if (nargs) drop();
4274         push_int_val(r);
4275         break;
4276 
4277       case tkclose:
4278         nargs = *ip++;
4279         r = close_file(to_str(STKP)->vst->str);
4280         drop();
4281         push_int_val(r);
4282         break;
4283 
4284       case tksprintf:
4285         nargs = *ip++;
4286         zstring_release(&TT.rgl.zspr);
4287         TT.rgl.zspr = new_zstring("", 0);
4288         varprint(fsprintf, 0, nargs);
4289         drop_n(nargs);
4290         vv = (struct zvalue)ZVINIT(ZF_STR, 0, TT.rgl.zspr);
4291         push_val(&vv);
4292         break;
4293 
4294       // Math builtins -- move here (per Oliver Webb suggestion)
4295       case tkatan2:
4296         nargs = *ip++;
4297         d = atan2(to_num(STKP-1), to_num(STKP));
4298         drop();
4299         STKP->num = d;
4300         break;
4301       case tkrand:
4302         nargs = *ip++;
4303         push_int_val(0);
4304         // Get all 53 mantissa bits in play:
4305         // (upper 26 bits * 2^27 + upper 27 bits) / 2^53
4306         STKP->num =
4307           ((random() >> 5) * 134217728.0 + (random() >> 4)) / 9007199254740992.0;
4308         break;
4309       case tksrand:
4310         nargs = *ip++;
4311         if (nargs == 1) {
4312           STKP->num = seedrand(to_num(STKP));
4313         } else push_int_val(seedrand(millinow()));
4314         break;
4315       case tkcos: case tksin: case tkexp: case tklog: case tksqrt: case tkint:
4316         nargs = *ip++;
4317         STKP->num = mathfunc[opcode-tkcos](to_num(STKP));
4318         break;
4319 
4320       default:
4321         // This should never happen:
4322         error_exit("!!! Unimplemented opcode %d", opcode);
4323     }
4324   }
4325   return opquit;
4326 }
4327 
4328 // interp() wraps the main interpreter loop interpx(). The main purpose
4329 // is to allow the TT.stack to be readjusted after an 'exit' from a function.
4330 // Also catches errors, as the normal operation should leave the TT.stack
4331 // depth unchanged after each run through the rules.
interp(int start,int * status)4332 static int interp(int start, int *status)
4333 {
4334   int stkptrbefore = stkn(0);
4335   int r = interpx(start, status);
4336   // If exit from function, TT.stack will be loaded with args etc. Clean it.
4337   if (r == tkexit) {
4338     // TODO FIXME is this safe? Just remove extra entries?
4339     STKP = &STACK[stkptrbefore];
4340   }
4341   if (stkn(0) - stkptrbefore)
4342     error_exit("!!AWK BUG stack pointer offset: %d", stkn(0) - stkptrbefore);
4343   return r;
4344 }
4345 
insert_argv_map(struct zvalue * map,int key,char * value)4346 static void insert_argv_map(struct zvalue *map, int key, char *value)
4347 {
4348   struct zvalue zkey = ZVINIT(ZF_STR, 0, num_to_zstring(key, ENSURE_STR(&STACK[CONVFMT])->vst->str));
4349   struct zvalue *v = get_map_val(map, &zkey);
4350   zvalue_release_zstring(&zkey);
4351   zvalue_release_zstring(v);
4352   *v = new_str_val(value);
4353   check_numeric_string(v);
4354 }
4355 
init_globals(int optind,int argc,char ** argv,char * sepstring,struct arg_list * assign_args)4356 static void init_globals(int optind, int argc, char **argv, char *sepstring,
4357     struct arg_list *assign_args)
4358 {
4359   // Global variables reside at the bottom of the TT.stack. Start with the awk
4360   // "special variables":  ARGC, ARGV, CONVFMT, ENVIRON, FILENAME, FNR, FS, NF,
4361   // NR, OFMT, OFS, ORS, RLENGTH, RS, RSTART, SUBSEP
4362 
4363   STACK[CONVFMT] = new_str_val("%.6g");
4364   // Init ENVIRON map.
4365   struct zvalue m = ZVINIT(ZF_MAP, 0, 0);
4366   zvalue_map_init(&m);
4367   STACK[ENVIRON] = m;
4368   for (char **pkey = environ; *pkey; pkey++) {
4369     char *pval = strchr(*pkey, '=');
4370     if (!pval) continue;
4371     struct zvalue zkey = ZVINIT(ZF_STR, 0, new_zstring(*pkey, pval - *pkey));
4372     struct zvalue *v = get_map_val(&m, &zkey);
4373     zstring_release(&zkey.vst);
4374     if (v->vst) FFATAL("env var dup? (%s)", pkey);
4375     *v = new_str_val(++pval);    // FIXME refcnt
4376     check_numeric_string(v);
4377   }
4378 
4379   // Init ARGV map.
4380   m = (struct zvalue)ZVINIT(ZF_MAP, 0, 0);
4381   zvalue_map_init(&m);
4382   STACK[ARGV] = m;
4383   insert_argv_map(&m, 0, TT.progname);
4384   int nargc = 1;
4385   for (int k = optind; k < argc; k++) {
4386     insert_argv_map(&m, nargc, argv[k]);
4387     nargc++;
4388   }
4389 
4390   // Init rest of the awk special variables.
4391   STACK[ARGC] = (struct zvalue)ZVINIT(ZF_NUM, nargc, 0);
4392   STACK[FILENAME] = new_str_val("");
4393   STACK[FNR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4394   STACK[FS] = new_str_val(sepstring);
4395   STACK[NF] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4396   STACK[NR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4397   STACK[OFMT] = new_str_val("%.6g");
4398   STACK[OFS] = new_str_val(" ");
4399   STACK[ORS] = new_str_val("\n");
4400   STACK[RLENGTH] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4401   STACK[RS] = new_str_val("\n");
4402   STACK[RSTART] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4403   STACK[SUBSEP] = new_str_val("\034");
4404 
4405   // Init program globals.
4406   //
4407   // Push global variables on the TT.stack at offsets matching their index in the
4408   // global var table.  In the global var table we may have the type as scalar
4409   // or map if it is used as such in the program. In that case we init the
4410   // pushed arg from the type of the globals table.
4411   // But if a global var appears only as a bare arg in a function call it will
4412   // not be typed in the globals table. In that case we can only say it "may be"
4413   // a map, but we have to assume the possibility and attach a map to the
4414   // var. When/if the var is used as a map or scalar in the called function it
4415   // will be converted to a map or scalar as required.
4416   // See force_maybemap_to_scalar(), and the similar comment in
4417   // 'case tkfunction:' above.
4418   //
4419   int gstx, len = zlist_len(&TT.globals_table);
4420   for (gstx = TT.spec_var_limit; gstx < len; gstx++) {
4421     struct symtab_slot gs = GLOBAL[gstx];
4422     struct zvalue v = ZVINIT(gs.flags, 0, 0);
4423     if (v.flags == 0) {
4424       zvalue_map_init(&v);
4425       v.flags = ZF_MAYBEMAP;
4426     } else if (IS_MAP(&v)) {
4427       zvalue_map_init(&v);
4428     } else {
4429       // Set SCALAR flag 0 to create "uninitialized" scalar.
4430       v.flags = 0;
4431     }
4432     push_val(&v);
4433   }
4434 
4435   // Init -v assignment options.
4436   for (struct arg_list *p = assign_args; p; p = p->next) {
4437     char *asgn = p->arg;
4438     char *val = strchr(asgn, '=');
4439     if (!val) error_exit("bad -v assignment format");
4440     *val++ = 0;
4441     assign_global(asgn, val);
4442   }
4443 
4444   TT.rgl.cur_arg = new_str_val("<cmdline>");
4445   uninit_string_zvalue = new_str_val("");
4446   zvalue_copy(&FIELD[0], &uninit_string_zvalue);
4447 }
4448 
run_files(int * status)4449 static void run_files(int *status)
4450 {
4451   int r = 0;
4452   while (r != tkexit && *status < 0 && getrec_f0() >= 0)
4453     if ((r = interp(TT.cgl.first_recrule, status)) == tknextfile) next_fp();
4454 }
4455 
free_literal_regex(void)4456 static void free_literal_regex(void)
4457 {
4458   int len = zlist_len(&TT.literals);
4459   for (int k = 1; k < len; k++)
4460     if (IS_RX(&LITERAL[k])) regfree(LITERAL[k].rx);
4461 }
4462 
run(int optind,int argc,char ** argv,char * sepstring,struct arg_list * assign_args)4463 static void run(int optind, int argc, char **argv, char *sepstring,
4464     struct arg_list *assign_args)
4465 {
4466   char *printf_fmt_rx = "%[-+ #0']*([*]|[0-9]*)([.]([*]|[0-9]*))?l?[aAdiouxXfFeEgGcs%]";
4467   init_globals(optind, argc, argv, sepstring, assign_args);
4468   TT.cfile = xzalloc(sizeof(struct zfile));
4469   xregcomp(&TT.rx_default, "[ \t\n]+", REG_EXTENDED);
4470   xregcomp(&TT.rx_last, "[ \t\n]+", REG_EXTENDED);
4471   xregcomp(&TT.rx_printf_fmt, printf_fmt_rx, REG_EXTENDED);
4472   new_file("-", stdin, 'r', 'f')->is_std_file = 1;
4473   new_file("/dev/stdin", stdin, 'r', 'f')->is_std_file = 1;
4474   new_file("/dev/stdout", stdout, 'w', 'f')->is_std_file = 1;
4475   TT.zstdout = TT.zfiles;
4476   new_file("/dev/stderr", stderr, 'w', 'f')->is_std_file = 1;
4477   seedrand(123);
4478   int status = -1, r = 0;
4479   if (TT.cgl.first_begin) r = interp(TT.cgl.first_begin, &status);
4480   if (r != tkexit)
4481     if (TT.cgl.first_recrule) run_files(&status);
4482   if (TT.cgl.first_end) r = interp(TT.cgl.first_end, &status);
4483   regfree(&TT.rx_printf_fmt);
4484   regfree(&TT.rx_default);
4485   regfree(&TT.rx_last);
4486   free_literal_regex();
4487   close_file(0);    // close all files
4488   if (status >= 0) exit(status);
4489 }
4490 
////////////////////
//// main
////////////////////
4494 
progfiles_init(char * progstring,struct arg_list * prog_args)4495 static void progfiles_init(char *progstring, struct arg_list *prog_args)
4496 {
4497   TT.scs->p = progstring ? progstring : "  " + 2;
4498   TT.scs->progstring = progstring;
4499   TT.scs->prog_args = prog_args;
4500   TT.scs->filename = "(cmdline)";
4501   TT.scs->maxtok = 256;
4502   TT.scs->tokstr = xzalloc(TT.scs->maxtok);
4503 }
4504 
awk(char * sepstring,char * progstring,struct arg_list * prog_args,struct arg_list * assign_args,int optind,int argc,char ** argv,int opt_run_prog)4505 static int awk(char *sepstring, char *progstring, struct arg_list *prog_args,
4506     struct arg_list *assign_args, int optind, int argc, char **argv,
4507     int opt_run_prog)
4508 {
4509   struct scanner_state ss = {0};
4510   TT.scs = &ss;
4511 
4512   setlocale(LC_NUMERIC, "");
4513   progfiles_init(progstring, prog_args);
4514   compile();
4515 
4516   if (TT.cgl.compile_error_count)
4517     error_exit("%d syntax error(s)", TT.cgl.compile_error_count);
4518   else {
4519     if (opt_run_prog)
4520       run(optind, argc, argv, sepstring, assign_args);
4521   }
4522 
4523   return TT.cgl.compile_error_count;
4524 }
4525 
awk_main(void)4526 void awk_main(void)
4527 {
4528   char *sepstring = TT.F ? escape_str(TT.F, 0) : " ";
4529   int optind = 0;
4530   char *progstring = NULL;
4531 
4532   TT.pbuf = toybuf;
4533   toys.exitval = 2;
4534   if (!TT.f) {
4535     if (*toys.optargs) progstring = toys.optargs[optind++];
4536     else error_exit("No program string\n");
4537   }
4538   TT.progname = toys.which->name;
4539   toys.exitval = awk(sepstring, progstring, TT.f, TT.v,
4540       optind, toys.optc, toys.optargs, !FLAG(c));
4541 }
4542