/* xgettext PHP backend.
   Copyright (C) 2001-2003, 2005-2010, 2014, 2018-2020 Free Software Foundation, Inc.

   This file was written by Bruno Haible <bruno@clisp.org>, 2002.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

/* Specification.  */
#include "x-php.h"

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "message.h"
#include "rc-str-list.h"
#include "xgettext.h"
#include "xg-pos.h"
#include "xg-mixed-string.h"
#include "xg-arglist-context.h"
#include "xg-arglist-callshape.h"
#include "xg-arglist-parser.h"
#include "xg-message.h"
#include "error.h"
#include "xalloc.h"
#include "gettext.h"
43 
/* Shorthand for marking and translating a message via gettext.  */
#define _(s) gettext(s)

/* Number of elements of the array A.  Only meaningful for true arrays,
   not for pointers.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
47 
48 
/* The PHP syntax is defined in phpdoc/manual/langref.html.
   See also php-4.1.0/Zend/zend_language_scanner.l
   and      php-4.1.0/Zend/zend_language_parser.y.
   Note that variable and function names can contain bytes in the range
   0x7f..0xff; see
     http://www.php.net/manual/en/language.variables.php
     http://www.php.net/manual/en/language.functions.php  */
56 
57 
58 /* ====================== Keyword set customization.  ====================== */
59 
60 /* If true extract all strings.  */
61 static bool extract_all = false;
62 
63 static hash_table keywords;
64 static bool default_keywords = true;
65 
66 
67 void
x_php_extract_all()68 x_php_extract_all ()
69 {
70   extract_all = true;
71 }
72 
73 
74 void
x_php_keyword(const char * name)75 x_php_keyword (const char *name)
76 {
77   if (name == NULL)
78     default_keywords = false;
79   else
80     {
81       const char *end;
82       struct callshape shape;
83       const char *colon;
84 
85       if (keywords.table == NULL)
86         hash_init (&keywords, 100);
87 
88       split_keywordspec (name, &end, &shape);
89 
90       /* The characters between name and end should form a valid C identifier.
91          A colon means an invalid parse in split_keywordspec().  */
92       colon = strchr (name, ':');
93       if (colon == NULL || colon >= end)
94         insert_keyword_callshape (&keywords, name, end - name, &shape);
95     }
96 }
97 
98 /* Finish initializing the keywords hash table.
99    Called after argument processing, before each file is processed.  */
100 static void
init_keywords()101 init_keywords ()
102 {
103   if (default_keywords)
104     {
105       /* When adding new keywords here, also update the documentation in
106          xgettext.texi!  */
107       x_php_keyword ("_");
108       x_php_keyword ("gettext");
109       x_php_keyword ("dgettext:2");
110       x_php_keyword ("dcgettext:2");
111       /* The following were added in PHP 4.2.0.  */
112       x_php_keyword ("ngettext:1,2");
113       x_php_keyword ("dngettext:2,3");
114       x_php_keyword ("dcngettext:2,3");
115       default_keywords = false;
116     }
117 }
118 
/* Register the PHP flag specifications, in the order listed below, by
   passing each one to xgettext_record_flag ().  */
void
init_flag_table_php (void)
{
  static const char *const flag_specs[] =
    {
      "_:1:pass-php-format",
      "gettext:1:pass-php-format",
      "dgettext:2:pass-php-format",
      "dcgettext:2:pass-php-format",
      "ngettext:1:pass-php-format",
      "ngettext:2:pass-php-format",
      "dngettext:2:pass-php-format",
      "dngettext:3:pass-php-format",
      "dcngettext:2:pass-php-format",
      "dcngettext:3:pass-php-format",
      "sprintf:1:php-format",
      "printf:1:php-format"
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}
135 
136 
/* ======================== Reading of characters.  ======================== */

/* The input file stream.  */
static FILE *fp;


/* 1. line_number handling.  */

/* Stack of characters pushed back by phase1_ungetc (); the most recently
   pushed character is returned first by phase1_getc ().  */
static unsigned char phase1_pushback[2];
/* Number of characters currently stored in phase1_pushback.  */
static int phase1_pushback_length;
147 
148 static int
phase1_getc()149 phase1_getc ()
150 {
151   int c;
152 
153   if (phase1_pushback_length)
154     c = phase1_pushback[--phase1_pushback_length];
155   else
156     {
157       c = getc (fp);
158 
159       if (c == EOF)
160         {
161           if (ferror (fp))
162             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
163                    real_file_name);
164           return EOF;
165         }
166     }
167 
168   if (c == '\n')
169     line_number++;
170 
171   return c;
172 }
173 
174 /* Supports 2 characters of pushback.  */
175 static void
phase1_ungetc(int c)176 phase1_ungetc (int c)
177 {
178   if (c != EOF)
179     {
180       if (c == '\n')
181         --line_number;
182 
183       if (phase1_pushback_length == SIZEOF (phase1_pushback))
184         abort ();
185       phase1_pushback[phase1_pushback_length++] = c;
186     }
187 }
188 
189 
/* 2. Ignore HTML sections.  They are equivalent to PHP echo commands and
   therefore don't contain translatable strings.  */

/* Return true if C is one of the whitespace characters allowed inside
   an HTML tag.  */
static bool
skip_html_is_space (int c)
{
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

/* Starting with the already read character C, read on as long as the
   current character is whitespace.  Return the first non-whitespace
   character (possibly EOF).  */
static int
skip_html_skip_space (int c)
{
  while (skip_html_is_space (c))
    c = phase1_getc ();
  return c;
}

/* Match the already read character C, followed by as many further input
   characters as needed, against a word.  LOWER and UPPER are two spellings
   of the word of equal length; each position may match either of them
   (pass the same string twice for a case-sensitive match).
   Return true if the whole word matched.  Otherwise push back the first
   mismatching character and return false.  */
static bool
skip_html_match (int c, const char *lower, const char *upper)
{
  for (;;)
    {
      if (c != (unsigned char) *lower && c != (unsigned char) *upper)
        {
          phase1_ungetc (c);
          return false;
        }
      lower++;
      upper++;
      if (*lower == '\0')
        return true;
      c = phase1_getc ();
    }
}

/* C is the first character after a '<'.  Try to read the rest of one of
     < script language = php >
     < script language = "php" >
     < script language = 'php' >
   Return true if the complete tag was consumed.  Otherwise push back the
   first character that did not fit and return false.  */
static bool
skip_html_script_tag (int c)
{
  c = skip_html_skip_space (c);
  if (!skip_html_match (c, "script", "SCRIPT"))
    return false;

  /* At least one whitespace character is required after "script".  */
  c = phase1_getc ();
  if (!skip_html_is_space (c))
    {
      phase1_ungetc (c);
      return false;
    }
  c = skip_html_skip_space (phase1_getc ());
  if (!skip_html_match (c, "language", "LANGUAGE"))
    return false;

  c = skip_html_skip_space (phase1_getc ());
  if (c != '=')
    {
      phase1_ungetc (c);
      return false;
    }

  /* The value "php" is matched case-sensitively, optionally quoted.  */
  c = skip_html_skip_space (phase1_getc ());
  if (c == '"' || c == '\'')
    {
      int quote = c;

      if (!skip_html_match (phase1_getc (), "php", "php"))
        return false;
      c = phase1_getc ();
      if (c != quote)
        {
          phase1_ungetc (c);
          return false;
        }
    }
  else if (!skip_html_match (c, "php", "php"))
    return false;

  c = skip_html_skip_space (phase1_getc ());
  if (c != '>')
    {
      phase1_ungetc (c);
      return false;
    }
  return true;
}

/* Consume input up to and including the next marker that enters PHP mode,
   or up to EOF.  */
static void
skip_html (void)
{
  for (;;)
    {
      int c = phase1_getc ();
      int c2;

      if (c == EOF)
        return;
      if (c != '<')
        continue;

      c2 = phase1_getc ();
      if (c2 == EOF)
        return;

      if (c2 == '?' || c2 == '%')
        {
          /* <?php is the normal way to enter PHP mode.  <? and <?= as well
             as <% and <%= are recognized by PHP depending on a
             configuration setting.  */
          int c3 = phase1_getc ();

          if (c3 != '=')
            phase1_ungetc (c3);
          return;
        }

      if (c2 == '<')
        {
          /* The second '<' may itself start a marker.  */
          phase1_ungetc (c2);
          continue;
        }

      /* The script tag forms are always recognized.  */
      if (skip_html_script_tag (c2))
        return;
    }
}
434 
#if 0

/* Disabled code: a character phase that recognizes the markers which
   leave PHP mode (?>, %> and < / script >), skips the following HTML
   section and returns a space instead.
   NOTE(review): in the '<' case only the last character read is pushed
   back when the match fails, so the characters consumed before it are
   dropped.  Fix this before enabling the code.  */

static unsigned char phase2_pushback[1];
static int phase2_pushback_length;

static int
phase2_getc ()
{
  int c;

  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  c = phase1_getc ();
  switch (c)
    {
    case '?':
    case '%':
      {
        int c2 = phase1_getc ();
        if (c2 == '>')
          {
            /* ?> and %> terminate PHP mode and switch back to HTML mode.  */
            skip_html ();
            return ' ';
          }
        phase1_ungetc (c2);
      }
      break;

    case '<':
      {
        int c2 = phase1_getc ();

        /* < / script > terminates PHP mode and switches back to HTML mode.  */
        while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
          c2 = phase1_getc ();
        if (c2 == '/')
          {
            do
              c2 = phase1_getc ();
            while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
            if (c2 == 's' || c2 == 'S')
              {
                c2 = phase1_getc ();
                if (c2 == 'c' || c2 == 'C')
                  {
                    c2 = phase1_getc ();
                    if (c2 == 'r' || c2 == 'R')
                      {
                        c2 = phase1_getc ();
                        if (c2 == 'i' || c2 == 'I')
                          {
                            c2 = phase1_getc ();
                            if (c2 == 'p' || c2 == 'P')
                              {
                                c2 = phase1_getc ();
                                if (c2 == 't' || c2 == 'T')
                                  {
                                    do
                                      c2 = phase1_getc ();
                                    while (c2 == ' ' || c2 == '\t'
                                           || c2 == '\n' || c2 == '\r');
                                    if (c2 == '>')
                                      {
                                        skip_html ();
                                        return ' ';
                                      }
                                  }
                              }
                          }
                      }
                  }
              }
          }
        phase1_ungetc (c2);
      }
      break;
    }

  return c;
}

/* Supports 1 character of pushback.  EOF is ignored.  */
static void
phase2_ungetc (int c)
{
  if (c != EOF)
    {
      if (phase2_pushback_length == SIZEOF (phase2_pushback))
        abort ();
      phase2_pushback[phase2_pushback_length++] = c;
    }
}

#endif
530 
531 
/* Accumulating comments.  */

/* Storage for the comment line being collected.  */
static char *buffer;
/* Allocated size of buffer, in bytes.  */
static size_t bufmax;
/* Number of bytes of buffer currently in use.  */
static size_t buflen;
537 
538 static inline void
comment_start()539 comment_start ()
540 {
541   buflen = 0;
542 }
543 
544 static inline void
comment_add(int c)545 comment_add (int c)
546 {
547   if (buflen >= bufmax)
548     {
549       bufmax = 2 * bufmax + 10;
550       buffer = xrealloc (buffer, bufmax);
551     }
552   buffer[buflen++] = c;
553 }
554 
555 static inline void
comment_line_end(size_t chars_to_remove)556 comment_line_end (size_t chars_to_remove)
557 {
558   buflen -= chars_to_remove;
559   while (buflen >= 1
560          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
561     --buflen;
562   if (chars_to_remove == 0 && buflen >= bufmax)
563     {
564       bufmax = 2 * bufmax + 10;
565       buffer = xrealloc (buffer, bufmax);
566     }
567   buffer[buflen] = '\0';
568   savable_comment_add (buffer);
569 }
570 
571 
/* 3. Replace each comment that is not inside a string literal with a
   space character.  We need to remember the comment for later, because
   it may be attached to a keyword string.  */

/* These are for tracking whether comments count as immediately before
   keyword.  */
static int last_comment_line;
static int last_non_comment_line;

/* Pushed-back characters that phase3_getc () returns before reading
   further input, and how many of them are stored.  */
static unsigned char phase3_pushback[1];
static int phase3_pushback_length;
583 
584 static int
phase3_getc()585 phase3_getc ()
586 {
587   int lineno;
588   int c;
589 
590   if (phase3_pushback_length)
591     return phase3_pushback[--phase3_pushback_length];
592 
593   c = phase1_getc ();
594 
595   if (c == '#')
596     {
597       /* sh comment.  */
598       bool last_was_qmark = false;
599 
600       comment_start ();
601       lineno = line_number;
602       for (;;)
603         {
604           c = phase1_getc ();
605           if (c == '\n' || c == EOF)
606             {
607               comment_line_end (0);
608               break;
609             }
610           if (last_was_qmark && c == '>')
611             {
612               comment_line_end (1);
613               skip_html ();
614               break;
615             }
616           /* We skip all leading white space, but not EOLs.  */
617           if (!(buflen == 0 && (c == ' ' || c == '\t')))
618             comment_add (c);
619           last_was_qmark = (c == '?' || c == '%');
620         }
621       last_comment_line = lineno;
622       return '\n';
623     }
624   else if (c == '/')
625     {
626       c = phase1_getc ();
627 
628       switch (c)
629         {
630         default:
631           phase1_ungetc (c);
632           return '/';
633 
634         case '*':
635           {
636             /* C comment.  */
637             bool last_was_star;
638 
639             comment_start ();
640             lineno = line_number;
641             last_was_star = false;
642             for (;;)
643               {
644                 c = phase1_getc ();
645                 if (c == EOF)
646                   break;
647                 /* We skip all leading white space, but not EOLs.  */
648                 if (buflen == 0 && (c == ' ' || c == '\t'))
649                   continue;
650                 comment_add (c);
651                 switch (c)
652                   {
653                   case '\n':
654                     comment_line_end (1);
655                     comment_start ();
656                     lineno = line_number;
657                     last_was_star = false;
658                     continue;
659 
660                   case '*':
661                     last_was_star = true;
662                     continue;
663 
664                   case '/':
665                     if (last_was_star)
666                       {
667                         comment_line_end (2);
668                         break;
669                       }
670                     /* FALLTHROUGH */
671 
672                   default:
673                     last_was_star = false;
674                     continue;
675                   }
676                 break;
677               }
678             last_comment_line = lineno;
679             return ' ';
680           }
681 
682         case '/':
683           {
684             /* C++ comment.  */
685             bool last_was_qmark = false;
686 
687             comment_start ();
688             lineno = line_number;
689             for (;;)
690               {
691                 c = phase1_getc ();
692                 if (c == '\n' || c == EOF)
693                   {
694                     comment_line_end (0);
695                     break;
696                   }
697                 if (last_was_qmark && c == '>')
698                   {
699                     comment_line_end (1);
700                     skip_html ();
701                     break;
702                   }
703                 /* We skip all leading white space, but not EOLs.  */
704                 if (!(buflen == 0 && (c == ' ' || c == '\t')))
705                   comment_add (c);
706                 last_was_qmark = (c == '?' || c == '%');
707               }
708             last_comment_line = lineno;
709             return '\n';
710           }
711         }
712     }
713   else
714     return c;
715 }
716 
#ifdef unused
/* Push the character C back so that the next phase3_getc () returns it.
   Supports 1 character of pushback; exceeding that aborts.  EOF is
   ignored.  */
static void
phase3_ungetc (int c)
{
  if (c != EOF)
    {
      if (phase3_pushback_length == SIZEOF (phase3_pushback))
        abort ();
      phase3_pushback[phase3_pushback_length++] = c;
    }
}
#endif
729 
730 
/* ========================== Reading of tokens.  ========================== */


/* The kinds of tokens.  */
enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_lbracket,          /* [ */
  token_type_rbracket,          /* ] */
  token_type_dot,               /* . */
  token_type_operator1,         /* * / % ++ -- */
  token_type_operator2,         /* + - ! ~ @ */
  token_type_string_literal,    /* "abc" */
  token_type_symbol,            /* symbol, number */
  token_type_other              /* misc. operator */
};
typedef enum token_type_ty token_type_ty;
750 
751 typedef struct token_ty token_ty;
752 struct token_ty
753 {
754   token_type_ty type;
755   char *string;         /* for token_type_string_literal, token_type_symbol */
756   refcounted_string_list_ty *comment;   /* for token_type_string_literal */
757   int line_number;
758 };
759 
760 
761 /* Free the memory pointed to by a 'struct token_ty'.  */
762 static inline void
free_token(token_ty * tp)763 free_token (token_ty *tp)
764 {
765   if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
766     free (tp->string);
767   if (tp->type == token_type_string_literal)
768     drop_reference (tp->comment);
769 }
770 
771 
772 /* 4. Combine characters into tokens.  Discard whitespace.  */
773 
774 static token_ty phase4_pushback[3];
775 static int phase4_pushback_length;
776 
777 static void
phase4_get(token_ty * tp)778 phase4_get (token_ty *tp)
779 {
780   static char *buffer;
781   static int bufmax;
782   int bufpos;
783   int c;
784 
785   if (phase4_pushback_length)
786     {
787       *tp = phase4_pushback[--phase4_pushback_length];
788       return;
789     }
790   tp->string = NULL;
791 
792   for (;;)
793     {
794       tp->line_number = line_number;
795       c = phase3_getc ();
796       switch (c)
797         {
798         case EOF:
799           tp->type = token_type_eof;
800           return;
801 
802         case '\n':
803           if (last_non_comment_line > last_comment_line)
804             savable_comment_reset ();
805           /* FALLTHROUGH */
806         case ' ':
807         case '\t':
808         case '\r':
809           /* Ignore whitespace.  */
810           continue;
811         }
812 
813       last_non_comment_line = tp->line_number;
814 
815       switch (c)
816         {
817         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
818         case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
819         case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
820         case 'V': case 'W': case 'X': case 'Y': case 'Z':
821         case '_':
822         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
823         case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
824         case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
825         case 'v': case 'w': case 'x': case 'y': case 'z':
826         case 127: case 128: case 129: case 130: case 131: case 132: case 133:
827         case 134: case 135: case 136: case 137: case 138: case 139: case 140:
828         case 141: case 142: case 143: case 144: case 145: case 146: case 147:
829         case 148: case 149: case 150: case 151: case 152: case 153: case 154:
830         case 155: case 156: case 157: case 158: case 159: case 160: case 161:
831         case 162: case 163: case 164: case 165: case 166: case 167: case 168:
832         case 169: case 170: case 171: case 172: case 173: case 174: case 175:
833         case 176: case 177: case 178: case 179: case 180: case 181: case 182:
834         case 183: case 184: case 185: case 186: case 187: case 188: case 189:
835         case 190: case 191: case 192: case 193: case 194: case 195: case 196:
836         case 197: case 198: case 199: case 200: case 201: case 202: case 203:
837         case 204: case 205: case 206: case 207: case 208: case 209: case 210:
838         case 211: case 212: case 213: case 214: case 215: case 216: case 217:
839         case 218: case 219: case 220: case 221: case 222: case 223: case 224:
840         case 225: case 226: case 227: case 228: case 229: case 230: case 231:
841         case 232: case 233: case 234: case 235: case 236: case 237: case 238:
842         case 239: case 240: case 241: case 242: case 243: case 244: case 245:
843         case 246: case 247: case 248: case 249: case 250: case 251: case 252:
844         case 253: case 254: case 255:
845           bufpos = 0;
846           for (;;)
847             {
848               if (bufpos >= bufmax)
849                 {
850                   bufmax = 2 * bufmax + 10;
851                   buffer = xrealloc (buffer, bufmax);
852                 }
853               buffer[bufpos++] = c;
854               c = phase1_getc ();
855               switch (c)
856                 {
857                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
858                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
859                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
860                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
861                 case 'Y': case 'Z':
862                 case '_':
863                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
864                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
865                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
866                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
867                 case 'y': case 'z':
868                 case '0': case '1': case '2': case '3': case '4':
869                 case '5': case '6': case '7': case '8': case '9':
870                 case 127: case 128: case 129: case 130: case 131: case 132:
871                 case 133: case 134: case 135: case 136: case 137: case 138:
872                 case 139: case 140: case 141: case 142: case 143: case 144:
873                 case 145: case 146: case 147: case 148: case 149: case 150:
874                 case 151: case 152: case 153: case 154: case 155: case 156:
875                 case 157: case 158: case 159: case 160: case 161: case 162:
876                 case 163: case 164: case 165: case 166: case 167: case 168:
877                 case 169: case 170: case 171: case 172: case 173: case 174:
878                 case 175: case 176: case 177: case 178: case 179: case 180:
879                 case 181: case 182: case 183: case 184: case 185: case 186:
880                 case 187: case 188: case 189: case 190: case 191: case 192:
881                 case 193: case 194: case 195: case 196: case 197: case 198:
882                 case 199: case 200: case 201: case 202: case 203: case 204:
883                 case 205: case 206: case 207: case 208: case 209: case 210:
884                 case 211: case 212: case 213: case 214: case 215: case 216:
885                 case 217: case 218: case 219: case 220: case 221: case 222:
886                 case 223: case 224: case 225: case 226: case 227: case 228:
887                 case 229: case 230: case 231: case 232: case 233: case 234:
888                 case 235: case 236: case 237: case 238: case 239: case 240:
889                 case 241: case 242: case 243: case 244: case 245: case 246:
890                 case 247: case 248: case 249: case 250: case 251: case 252:
891                 case 253: case 254: case 255:
892                   continue;
893 
894                 default:
895                   phase1_ungetc (c);
896                   break;
897                 }
898               break;
899             }
900           if (bufpos >= bufmax)
901             {
902               bufmax = 2 * bufmax + 10;
903               buffer = xrealloc (buffer, bufmax);
904             }
905           buffer[bufpos] = 0;
906           tp->string = xstrdup (buffer);
907           tp->type = token_type_symbol;
908           return;
909 
910         case '\'':
911           /* Single-quoted string literal.  */
912           bufpos = 0;
913           for (;;)
914             {
915               c = phase1_getc ();
916               if (c == EOF || c == '\'')
917                 break;
918               if (c == '\\')
919                 {
920                   c = phase1_getc ();
921                   if (c != '\\' && c != '\'')
922                     {
923                       phase1_ungetc (c);
924                       c = '\\';
925                     }
926                 }
927               if (bufpos >= bufmax)
928                 {
929                   bufmax = 2 * bufmax + 10;
930                   buffer = xrealloc (buffer, bufmax);
931                 }
932               buffer[bufpos++] = c;
933             }
934           if (bufpos >= bufmax)
935             {
936               bufmax = 2 * bufmax + 10;
937               buffer = xrealloc (buffer, bufmax);
938             }
939           buffer[bufpos] = 0;
940           tp->type = token_type_string_literal;
941           tp->string = xstrdup (buffer);
942           tp->comment = add_reference (savable_comment);
943           return;
944 
945         case '"':
946           /* Double-quoted string literal.  */
947           tp->type = token_type_string_literal;
948           bufpos = 0;
949           for (;;)
950             {
951               c = phase1_getc ();
952               if (c == EOF || c == '"')
953                 break;
954               if (c == '$')
955                 {
956                   c = phase1_getc ();
957                   if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
958                       || c == '_' || c == '{' || c >= 0x7f)
959                     {
960                       /* String with variables.  */
961                       tp->type = token_type_other;
962                       continue;
963                     }
964                   phase1_ungetc (c);
965                   c = '$';
966                 }
967               if (c == '{')
968                 {
969                   c = phase1_getc ();
970                   if (c == '$')
971                     {
972                       /* String with expressions.  */
973                       tp->type = token_type_other;
974                       continue;
975                     }
976                   phase1_ungetc (c);
977                   c = '{';
978                 }
979               if (c == '\\')
980                 {
981                   int n, j;
982 
983                   c = phase1_getc ();
984                   switch (c)
985                     {
986                     case '"':
987                     case '\\':
988                     case '$':
989                       break;
990 
991                     case '0': case '1': case '2': case '3':
992                     case '4': case '5': case '6': case '7':
993                       n = 0;
994                       for (j = 0; j < 3; ++j)
995                         {
996                           n = n * 8 + c - '0';
997                           c = phase1_getc ();
998                           switch (c)
999                             {
1000                             default:
1001                               break;
1002 
1003                             case '0': case '1': case '2': case '3':
1004                             case '4': case '5': case '6': case '7':
1005                               continue;
1006                             }
1007                           break;
1008                         }
1009                       phase1_ungetc (c);
1010                       c = n;
1011                       break;
1012 
1013                     case 'x':
1014                       n = 0;
1015                       for (j = 0; j < 2; ++j)
1016                         {
1017                           c = phase1_getc ();
1018                           switch (c)
1019                             {
1020                             case '0': case '1': case '2': case '3': case '4':
1021                             case '5': case '6': case '7': case '8': case '9':
1022                               n = n * 16 + c - '0';
1023                               break;
1024                             case 'A': case 'B': case 'C': case 'D': case 'E':
1025                             case 'F':
1026                               n = n * 16 + 10 + c - 'A';
1027                               break;
1028                             case 'a': case 'b': case 'c': case 'd': case 'e':
1029                             case 'f':
1030                               n = n * 16 + 10 + c - 'a';
1031                               break;
1032                             default:
1033                               phase1_ungetc (c);
1034                               c = 0;
1035                               break;
1036                             }
1037                           if (c == 0)
1038                             break;
1039                         }
1040                       if (j == 0)
1041                         {
1042                           phase1_ungetc ('x');
1043                           c = '\\';
1044                         }
1045                       else
1046                         c = n;
1047                       break;
1048 
1049                     case 'n':
1050                       c = '\n';
1051                       break;
1052                     case 't':
1053                       c = '\t';
1054                       break;
1055                     case 'r':
1056                       c = '\r';
1057                       break;
1058 
1059                     default:
1060                       phase1_ungetc (c);
1061                       c = '\\';
1062                       break;
1063                     }
1064                 }
1065               if (bufpos >= bufmax)
1066                 {
1067                   bufmax = 2 * bufmax + 10;
1068                   buffer = xrealloc (buffer, bufmax);
1069                 }
1070               buffer[bufpos++] = c;
1071             }
1072           if (bufpos >= bufmax)
1073             {
1074               bufmax = 2 * bufmax + 10;
1075               buffer = xrealloc (buffer, bufmax);
1076             }
1077           buffer[bufpos] = 0;
1078           if (tp->type == token_type_string_literal)
1079             {
1080               tp->string = xstrdup (buffer);
1081               tp->comment = add_reference (savable_comment);
1082             }
1083           return;
1084 
1085         case '?':
1086         case '%':
1087           {
1088             int c2 = phase1_getc ();
1089             if (c2 == '>')
1090               {
1091                 /* ?> and %> terminate PHP mode and switch back to HTML
1092                    mode.  */
1093                 skip_html ();
1094                 tp->type = token_type_other;
1095               }
1096             else
1097               {
1098                 phase1_ungetc (c2);
1099                 tp->type = (c == '%' ? token_type_operator1 : token_type_other);
1100               }
1101             return;
1102           }
1103 
1104         case '(':
1105           tp->type = token_type_lparen;
1106           return;
1107 
1108         case ')':
1109           tp->type = token_type_rparen;
1110           return;
1111 
1112         case ',':
1113           tp->type = token_type_comma;
1114           return;
1115 
1116         case '[':
1117           tp->type = token_type_lbracket;
1118           return;
1119 
1120         case ']':
1121           tp->type = token_type_rbracket;
1122           return;
1123 
1124         case '.':
1125           tp->type = token_type_dot;
1126           return;
1127 
1128         case '*':
1129         case '/':
1130           tp->type = token_type_operator1;
1131           return;
1132 
1133         case '+':
1134         case '-':
1135           {
1136             int c2 = phase1_getc ();
1137             if (c2 == c)
1138               /* ++ or -- */
1139               tp->type = token_type_operator1;
1140             else
1141               /* + or - */
1142               {
1143                 phase1_ungetc (c2);
1144                 tp->type = token_type_operator2;
1145               }
1146             return;
1147           }
1148 
1149         case '!':
1150         case '~':
1151         case '@':
1152           tp->type = token_type_operator2;
1153           return;
1154 
1155         case '<':
1156           {
1157             int c2 = phase1_getc ();
1158             if (c2 == '<')
1159               {
1160                 int c3 = phase1_getc ();
1161                 if (c3 == '<')
1162                   {
1163                     int label_start = 0;
1164 
1165                     /* Start of here and now document.
1166                        Parse whitespace, then label, then newline.  */
1167                     do
1168                       c = phase3_getc ();
1169                     while (c == ' ' || c == '\t' || c == '\n' || c == '\r');
1170 
1171                     bufpos = 0;
1172                     do
1173                       {
1174                         if (bufpos >= bufmax)
1175                           {
1176                             bufmax = 2 * bufmax + 10;
1177                             buffer = xrealloc (buffer, bufmax);
1178                           }
1179                         buffer[bufpos++] = c;
1180                         c = phase3_getc ();
1181                       }
1182                     while (c != EOF && c != '\n' && c != '\r');
1183                     /* buffer[0..bufpos-1] now contains the label
1184                        (including single or double quotes).  */
1185 
1186                     if (*buffer == '\'' || *buffer == '"')
1187                       {
1188                         label_start++;
1189                         bufpos--;
1190                       }
1191 
1192                     /* Now skip the here document.  */
1193                     for (;;)
1194                       {
1195                         c = phase1_getc ();
1196                         if (c == EOF)
1197                           break;
1198                         if (c == '\n' || c == '\r')
1199                           {
1200                             int bufidx = label_start;
1201 
1202                             while (bufidx < bufpos)
1203                               {
1204                                 c = phase1_getc ();
1205                                 if (c == EOF)
1206                                   break;
1207                                 if (c != buffer[bufidx])
1208                                   {
1209                                     phase1_ungetc (c);
1210                                     break;
1211                                   }
1212                                 bufidx++;
1213                               }
1214                             if (bufidx == bufpos)
1215                               {
1216                                 c = phase1_getc ();
1217                                 if (c != ';')
1218                                   phase1_ungetc (c);
1219                                 c = phase1_getc ();
1220                                 if (c == '\n' || c == '\r')
1221                                   break;
1222                               }
1223                           }
1224                       }
1225 
1226                     /* FIXME: Ideally we should turn the here document into a
1227                        string literal if it didn't contain $ substitution.  And
1228                        we should also respect backslash escape sequences like
1229                        in double-quoted strings.  */
1230                     tp->type = token_type_other;
1231                     return;
1232                   }
1233                 phase1_ungetc (c3);
1234               }
1235 
1236             /* < / script > terminates PHP mode and switches back to HTML
1237                mode.  */
1238             while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
1239               c2 = phase1_getc ();
1240             if (c2 == '/')
1241               {
1242                 do
1243                   c2 = phase1_getc ();
1244                 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
1245                 if (c2 == 's' || c2 == 'S')
1246                   {
1247                     c2 = phase1_getc ();
1248                     if (c2 == 'c' || c2 == 'C')
1249                       {
1250                         c2 = phase1_getc ();
1251                         if (c2 == 'r' || c2 == 'R')
1252                           {
1253                             c2 = phase1_getc ();
1254                             if (c2 == 'i' || c2 == 'I')
1255                               {
1256                                 c2 = phase1_getc ();
1257                                 if (c2 == 'p' || c2 == 'P')
1258                                   {
1259                                     c2 = phase1_getc ();
1260                                     if (c2 == 't' || c2 == 'T')
1261                                       {
1262                                         do
1263                                           c2 = phase1_getc ();
1264                                         while (c2 == ' ' || c2 == '\t'
1265                                                || c2 == '\n' || c2 == '\r');
1266                                         if (c2 == '>')
1267                                           {
1268                                             skip_html ();
1269                                           }
1270                                         else
1271                                           phase1_ungetc (c2);
1272                                       }
1273                                     else
1274                                       phase1_ungetc (c2);
1275                                   }
1276                                 else
1277                                   phase1_ungetc (c2);
1278                               }
1279                             else
1280                               phase1_ungetc (c2);
1281                           }
1282                         else
1283                           phase1_ungetc (c2);
1284                       }
1285                     else
1286                       phase1_ungetc (c2);
1287                   }
1288                 else
1289                   phase1_ungetc (c2);
1290               }
1291             else
1292               phase1_ungetc (c2);
1293 
1294             tp->type = token_type_other;
1295             return;
1296           }
1297 
1298         case '`':
1299           /* Execution operator.  */
1300         default:
1301           /* We could carefully recognize each of the 2 and 3 character
1302              operators, but it is not necessary, as we only need to recognize
1303              gettext invocations.  Don't bother.  */
1304           tp->type = token_type_other;
1305           return;
1306         }
1307     }
1308 }
1309 
1310 /* Supports 3 tokens of pushback.  */
1311 static void
phase4_unget(token_ty * tp)1312 phase4_unget (token_ty *tp)
1313 {
1314   if (tp->type != token_type_eof)
1315     {
1316       if (phase4_pushback_length == SIZEOF (phase4_pushback))
1317         abort ();
1318       phase4_pushback[phase4_pushback_length++] = *tp;
1319     }
1320 }
1321 
1322 
1323 /* 5. Compile-time optimization of string literal concatenation.
1324    Combine "string1" . ... . "stringN" to the concatenated string if
1325      - the token before this expression is none of
1326        '+' '-' '.' '*' '/' '%' '!' '~' '++' '--' ')' '@'
1327        (because then the first string could be part of an expression with
1328        the same or higher precedence as '.', such as an additive,
1329        multiplicative, negation, preincrement, or cast expression),
1330      - the token after this expression is none of
1331        '*' '/' '%' '++' '--'
1332        (because then the last string could be part of an expression with
1333        higher precedence as '.', such as a multiplicative or postincrement
1334        expression).  */
1335 
1336 static token_type_ty phase5_last;
1337 
1338 static void
x_php_lex(token_ty * tp)1339 x_php_lex (token_ty *tp)
1340 {
1341   phase4_get (tp);
1342   if (tp->type == token_type_string_literal
1343       && !(phase5_last == token_type_dot
1344            || phase5_last == token_type_operator1
1345            || phase5_last == token_type_operator2
1346            || phase5_last == token_type_rparen))
1347     {
1348       char *sum = tp->string;
1349       size_t sum_len = strlen (sum);
1350 
1351       for (;;)
1352         {
1353           token_ty token2;
1354 
1355           phase4_get (&token2);
1356           if (token2.type == token_type_dot)
1357             {
1358               token_ty token3;
1359 
1360               phase4_get (&token3);
1361               if (token3.type == token_type_string_literal)
1362                 {
1363                   token_ty token_after;
1364 
1365                   phase4_get (&token_after);
1366                   if (token_after.type != token_type_operator1)
1367                     {
1368                       char *addend = token3.string;
1369                       size_t addend_len = strlen (addend);
1370 
1371                       sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1372                       memcpy (sum + sum_len, addend, addend_len + 1);
1373                       sum_len += addend_len;
1374 
1375                       phase4_unget (&token_after);
1376                       free_token (&token3);
1377                       free_token (&token2);
1378                       continue;
1379                     }
1380                   phase4_unget (&token_after);
1381                 }
1382               phase4_unget (&token3);
1383             }
1384           phase4_unget (&token2);
1385           break;
1386         }
1387       tp->string = sum;
1388     }
1389   phase5_last = tp->type;
1390 }
1391 
1392 
1393 /* ========================= Extracting strings.  ========================== */
1394 
1395 
1396 /* Context lookup table.  Assigned by extract_php from its flag_table
        argument; extract_balanced looks up every symbol token in it to
        obtain the context iterator for a following '('.  */
1397 static flag_context_list_table_ty *flag_context_list_table;
1398 
1399 
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid PHP program, and leave the complaints about
   the grammar to the PHP interpreter.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
1413 
1414 
1415 /* Extract messages until the next balanced closing parenthesis or bracket.
1416    Extracted messages are added to MLP.
1417    DELIM can be either token_type_rparen or token_type_rbracket, or
1418    token_type_eof to accept both.
1419    Return true upon eof, false upon closing parenthesis or bracket.  */
1420 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1421 extract_balanced (message_list_ty *mlp,
1422                   token_type_ty delim,
1423                   flag_context_ty outer_context,
1424                   flag_context_list_iterator_ty context_iter,
1425                   struct arglist_parser *argparser)
1426 {
1427   /* Current argument number.  */
1428   int arg = 1;
1429   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1430   int state;
1431   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1432   const struct callshapes *next_shapes = NULL;
1433   /* Context iterator that will be used if the next token is a '('.  */
1434   flag_context_list_iterator_ty next_context_iter =
1435     passthrough_context_list_iterator;
1436   /* Current context.  */
1437   flag_context_ty inner_context =
1438     inherited_context (outer_context,
1439                        flag_context_list_iterator_advance (&context_iter));
1440 
1441   /* Start state is 0.  */
1442   state = 0;
1443 
1444   for (;;)
1445     {
1446       token_ty token;
1447 
1448       x_php_lex (&token);
1449       switch (token.type)
1450         {
1451         case token_type_symbol:
1452           {
1453             void *keyword_value;
1454 
1455             if (hash_find_entry (&keywords, token.string, strlen (token.string),
1456                                  &keyword_value)
1457                 == 0)
1458               {
1459                 next_shapes = (const struct callshapes *) keyword_value;
1460                 state = 1;
1461               }
1462             else
1463               state = 0;
1464           }
1465           next_context_iter =
1466             flag_context_list_iterator (
1467               flag_context_list_table_lookup (
1468                 flag_context_list_table,
1469                 token.string, strlen (token.string)));
1470           free (token.string);
1471           continue;
1472 
1473         case token_type_lparen:
1474           if (extract_balanced (mlp, token_type_rparen,
1475                                 inner_context, next_context_iter,
1476                                 arglist_parser_alloc (mlp,
1477                                                       state ? next_shapes : NULL)))
1478             {
1479               arglist_parser_done (argparser, arg);
1480               return true;
1481             }
1482           next_context_iter = null_context_list_iterator;
1483           state = 0;
1484           continue;
1485 
1486         case token_type_rparen:
1487           if (delim == token_type_rparen || delim == token_type_eof)
1488             {
1489               arglist_parser_done (argparser, arg);
1490               return false;
1491             }
1492           next_context_iter = null_context_list_iterator;
1493           state = 0;
1494           continue;
1495 
1496         case token_type_comma:
1497           arg++;
1498           inner_context =
1499             inherited_context (outer_context,
1500                                flag_context_list_iterator_advance (
1501                                  &context_iter));
1502           next_context_iter = passthrough_context_list_iterator;
1503           state = 0;
1504           continue;
1505 
1506         case token_type_lbracket:
1507           if (extract_balanced (mlp, token_type_rbracket,
1508                                 null_context, null_context_list_iterator,
1509                                 arglist_parser_alloc (mlp, NULL)))
1510             {
1511               arglist_parser_done (argparser, arg);
1512               return true;
1513             }
1514           next_context_iter = null_context_list_iterator;
1515           state = 0;
1516           continue;
1517 
1518         case token_type_rbracket:
1519           if (delim == token_type_rbracket || delim == token_type_eof)
1520             {
1521               arglist_parser_done (argparser, arg);
1522               return false;
1523             }
1524           next_context_iter = null_context_list_iterator;
1525           state = 0;
1526           continue;
1527 
1528         case token_type_string_literal:
1529           {
1530             lex_pos_ty pos;
1531             pos.file_name = logical_file_name;
1532             pos.line_number = token.line_number;
1533 
1534             if (extract_all)
1535               remember_a_message (mlp, NULL, token.string, false, false,
1536                                   inner_context, &pos,
1537                                   NULL, token.comment, false);
1538             else
1539               {
1540                 mixed_string_ty *ms =
1541                   mixed_string_alloc_simple (token.string, lc_string,
1542                                              pos.file_name, pos.line_number);
1543                 free (token.string);
1544                 arglist_parser_remember (argparser, arg, ms, inner_context,
1545                                          pos.file_name, pos.line_number,
1546                                          token.comment, false);
1547               }
1548             drop_reference (token.comment);
1549           }
1550           next_context_iter = null_context_list_iterator;
1551           state = 0;
1552           continue;
1553 
1554         case token_type_dot:
1555         case token_type_operator1:
1556         case token_type_operator2:
1557         case token_type_other:
1558           next_context_iter = null_context_list_iterator;
1559           state = 0;
1560           continue;
1561 
1562         case token_type_eof:
1563           arglist_parser_done (argparser, arg);
1564           return true;
1565 
1566         default:
1567           abort ();
1568         }
1569     }
1570 }
1571 
1572 
1573 void
extract_php(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1574 extract_php (FILE *f,
1575              const char *real_filename, const char *logical_filename,
1576              flag_context_list_table_ty *flag_table,
1577              msgdomain_list_ty *mdlp)
1578 {
1579   message_list_ty *mlp = mdlp->item[0]->messages;
1580 
1581   fp = f;
1582   real_file_name = real_filename;
1583   logical_file_name = xstrdup (logical_filename);
1584   line_number = 1;
1585 
1586   phase1_pushback_length = 0;
1587 #if 0
1588   phase2_pushback_length = 0;
1589 #endif
1590 
1591   last_comment_line = -1;
1592   last_non_comment_line = -1;
1593 
1594   phase3_pushback_length = 0;
1595   phase4_pushback_length = 0;
1596 
1597   phase5_last = token_type_eof;
1598 
1599   flag_context_list_table = flag_table;
1600 
1601   init_keywords ();
1602 
1603   /* Initial mode is HTML mode, not PHP mode.  */
1604   skip_html ();
1605 
1606   /* Eat tokens until eof is seen.  When extract_balanced returns
1607      due to an unbalanced closing parenthesis, just restart it.  */
1608   while (!extract_balanced (mlp, token_type_eof,
1609                             null_context, null_context_list_iterator,
1610                             arglist_parser_alloc (mlp, NULL)))
1611     ;
1612 
1613   /* Close scanner.  */
1614   fp = NULL;
1615   real_file_name = NULL;
1616   logical_file_name = NULL;
1617   line_number = 0;
1618 }
1619