1 /* xgettext PHP backend.
2 Copyright (C) 2001-2003, 2005-2010, 2014, 2018-2020 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <bruno@clisp.org>, 2002.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-php.h"
25
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30
31 #include "message.h"
32 #include "rc-str-list.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-mixed-string.h"
36 #include "xg-arglist-context.h"
37 #include "xg-arglist-callshape.h"
38 #include "xg-arglist-parser.h"
39 #include "xg-message.h"
40 #include "error.h"
41 #include "xalloc.h"
42 #include "gettext.h"
43
44 #define _(s) gettext(s)
45
46 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
47
48
49 /* The PHP syntax is defined in phpdoc/manual/langref.html.
50 See also php-4.1.0/Zend/zend_language_scanner.l
51 and php-4.1.0/Zend/zend_language_parser.y.
52 Note that variable and function names can contain bytes in the range
53 0x7f..0xff; see
54 http://www.php.net/manual/en/language.variables.php
55 http://www.php.net/manual/en/language.functions.php */
56
57
58 /* ====================== Keyword set customization. ====================== */
59
60 /* If true extract all strings. */
61 static bool extract_all = false;
62
63 static hash_table keywords;
64 static bool default_keywords = true;
65
66
67 void
x_php_extract_all()68 x_php_extract_all ()
69 {
70 extract_all = true;
71 }
72
73
74 void
x_php_keyword(const char * name)75 x_php_keyword (const char *name)
76 {
77 if (name == NULL)
78 default_keywords = false;
79 else
80 {
81 const char *end;
82 struct callshape shape;
83 const char *colon;
84
85 if (keywords.table == NULL)
86 hash_init (&keywords, 100);
87
88 split_keywordspec (name, &end, &shape);
89
90 /* The characters between name and end should form a valid C identifier.
91 A colon means an invalid parse in split_keywordspec(). */
92 colon = strchr (name, ':');
93 if (colon == NULL || colon >= end)
94 insert_keyword_callshape (&keywords, name, end - name, &shape);
95 }
96 }
97
98 /* Finish initializing the keywords hash table.
99 Called after argument processing, before each file is processed. */
100 static void
init_keywords()101 init_keywords ()
102 {
103 if (default_keywords)
104 {
105 /* When adding new keywords here, also update the documentation in
106 xgettext.texi! */
107 x_php_keyword ("_");
108 x_php_keyword ("gettext");
109 x_php_keyword ("dgettext:2");
110 x_php_keyword ("dcgettext:2");
111 /* The following were added in PHP 4.2.0. */
112 x_php_keyword ("ngettext:1,2");
113 x_php_keyword ("dngettext:2,3");
114 x_php_keyword ("dcngettext:2,3");
115 default_keywords = false;
116 }
117 }
118
/* Record the format-string flags of the built-in PHP functions by passing
   each "function:argnum:flag" specification to xgettext_record_flag (),
   in the order listed below.  */
void
init_flag_table_php (void)
{
  static const char *const flag_specs[] =
    {
      "_:1:pass-php-format",
      "gettext:1:pass-php-format",
      "dgettext:2:pass-php-format",
      "dcgettext:2:pass-php-format",
      "ngettext:1:pass-php-format",
      "ngettext:2:pass-php-format",
      "dngettext:2:pass-php-format",
      "dngettext:3:pass-php-format",
      "dcngettext:2:pass-php-format",
      "dcngettext:3:pass-php-format",
      "sprintf:1:php-format",
      "printf:1:php-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
135
136
137 /* ======================== Reading of characters. ======================== */
138
139 /* The input file stream. */
140 static FILE *fp;
141
142
143 /* 1. line_number handling. */
144
145 static unsigned char phase1_pushback[2];
146 static int phase1_pushback_length;
147
148 static int
phase1_getc()149 phase1_getc ()
150 {
151 int c;
152
153 if (phase1_pushback_length)
154 c = phase1_pushback[--phase1_pushback_length];
155 else
156 {
157 c = getc (fp);
158
159 if (c == EOF)
160 {
161 if (ferror (fp))
162 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
163 real_file_name);
164 return EOF;
165 }
166 }
167
168 if (c == '\n')
169 line_number++;
170
171 return c;
172 }
173
174 /* Supports 2 characters of pushback. */
175 static void
phase1_ungetc(int c)176 phase1_ungetc (int c)
177 {
178 if (c != EOF)
179 {
180 if (c == '\n')
181 --line_number;
182
183 if (phase1_pushback_length == SIZEOF (phase1_pushback))
184 abort ();
185 phase1_pushback[phase1_pushback_length++] = c;
186 }
187 }
188
189
/* 2. Ignore HTML sections.  They are equivalent to PHP echo commands and
   therefore don't contain translatable strings.  */

/* Return true if C is one of the whitespace characters that may separate
   the parts of an HTML tag.  */
static inline bool
skip_html_is_space (int c)
{
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

/* Starting with the already read character C, skip whitespace and return
   the first character that is not whitespace (possibly EOF).  */
static int
skip_html_skip_space (int c)
{
  while (skip_html_is_space (c))
    c = phase1_getc ();
  return c;
}

/* Read characters and compare them, one by one, with the characters of
   WORD.  If IGNORE_CASE is true, a lowercase ASCII letter in WORD also
   matches its uppercase counterpart.
   Return true if all of WORD was matched.  Otherwise push back the first
   character that did not match and return false; the characters matched
   before it stay consumed.  */
static bool
skip_html_expect (const char *word, bool ignore_case)
{
  for (; *word != '\0'; word++)
    {
      int c = phase1_getc ();
      bool matches = (c == *word);

      if (!matches && ignore_case && *word >= 'a' && *word <= 'z')
        matches = (c == *word - 'a' + 'A');

      if (!matches)
        {
          phase1_ungetc (c);
          return false;
        }
    }
  return true;
}

/* Handle the input that follows a '<' in HTML mode.
   Return true if skip_html () must stop, i.e. if a PHP opening tag was
   consumed or EOF was reached.  Return false if the '<' did not start a
   PHP section; in this case the character at which the match failed has
   been pushed back, so that e.g. the second '<' of "<<?" is examined
   again by the caller.  */
static bool
skip_html_after_lt (void)
{
  int c2 = phase1_getc ();

  if (c2 == EOF)
    return true;

  if (c2 == '?' || c2 == '%')
    {
      /* <?php is the normal way to enter PHP mode.  <? and <?= as well as
         <% and <%= are recognized by PHP depending on a configuration
         setting.  */
      int c3 = phase1_getc ();

      if (c3 != '=')
        phase1_ungetc (c3);

      return true;
    }

  /* < script language = php >
     < script language = "php" >
     < script language = 'php' >
     are always recognized.  "script" and "language" are matched case
     insensitively, the value "php" case sensitively.  */
  c2 = skip_html_skip_space (c2);
  if (c2 != 's' && c2 != 'S')
    goto mismatch;
  if (!skip_html_expect ("cript", true))
    return false;

  /* At least one whitespace character is required after "script".  */
  c2 = phase1_getc ();
  if (!skip_html_is_space (c2))
    goto mismatch;
  c2 = skip_html_skip_space (c2);
  if (c2 != 'l' && c2 != 'L')
    goto mismatch;
  if (!skip_html_expect ("anguage", true))
    return false;

  c2 = skip_html_skip_space (phase1_getc ());
  if (c2 != '=')
    goto mismatch;

  c2 = skip_html_skip_space (phase1_getc ());
  if (c2 == '"')
    {
      if (!skip_html_expect ("php\"", false))
        return false;
    }
  else if (c2 == '\'')
    {
      if (!skip_html_expect ("php'", false))
        return false;
    }
  else
    {
      if (c2 != 'p')
        goto mismatch;
      if (!skip_html_expect ("hp", false))
        return false;
    }

  c2 = skip_html_skip_space (phase1_getc ());
  if (c2 != '>')
    goto mismatch;

  return true;

 mismatch:
  phase1_ungetc (c2);
  return false;
}

/* Skip input up to and including the next tag that enters PHP mode, or up
   to EOF if there is none.  */
static void
skip_html (void)
{
  for (;;)
    {
      int c = phase1_getc ();

      if (c == EOF)
        return;

      if (c == '<' && skip_html_after_lt ())
        return;
    }
}
434
435 #if 0
436
437 static unsigned char phase2_pushback[1];
438 static int phase2_pushback_length;
439
440 static int
441 phase2_getc ()
442 {
443 int c;
444
445 if (phase2_pushback_length)
446 return phase2_pushback[--phase2_pushback_length];
447
448 c = phase1_getc ();
449 switch (c)
450 {
451 case '?':
452 case '%':
453 {
454 int c2 = phase1_getc ();
455 if (c2 == '>')
456 {
457 /* ?> and %> terminate PHP mode and switch back to HTML mode. */
458 skip_html ();
459 return ' ';
460 }
461 phase1_ungetc (c2);
462 }
463 break;
464
465 case '<':
466 {
467 int c2 = phase1_getc ();
468
469 /* < / script > terminates PHP mode and switches back to HTML mode. */
470 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
471 c2 = phase1_getc ();
472 if (c2 == '/')
473 {
474 do
475 c2 = phase1_getc ();
476 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
477 if (c2 == 's' || c2 == 'S')
478 {
479 c2 = phase1_getc ();
480 if (c2 == 'c' || c2 == 'C')
481 {
482 c2 = phase1_getc ();
483 if (c2 == 'r' || c2 == 'R')
484 {
485 c2 = phase1_getc ();
486 if (c2 == 'i' || c2 == 'I')
487 {
488 c2 = phase1_getc ();
489 if (c2 == 'p' || c2 == 'P')
490 {
491 c2 = phase1_getc ();
492 if (c2 == 't' || c2 == 'T')
493 {
494 do
495 c2 = phase1_getc ();
496 while (c2 == ' ' || c2 == '\t'
497 || c2 == '\n' || c2 == '\r');
498 if (c2 == '>')
499 {
500 skip_html ();
501 return ' ';
502 }
503 }
504 }
505 }
506 }
507 }
508 }
509 }
510 phase1_ungetc (c2);
511 }
512 break;
513 }
514
515 return c;
516 }
517
518 static void
519 phase2_ungetc (int c)
520 {
521 if (c != EOF)
522 {
523 if (phase2_pushback_length == SIZEOF (phase2_pushback))
524 abort ();
525 phase2_pushback[phase2_pushback_length++] = c;
526 }
527 }
528
529 #endif
530
531
/* Accumulating comments.  */

/* The current comment line is collected in buffer[0..buflen-1]; bufmax is
   the allocated size of buffer.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;

/* Begin a new comment line by discarding the accumulated characters.
   The allocated buffer is kept for reuse.  */
static inline void
comment_start (void)
{
  buflen = 0;
}
543
544 static inline void
comment_add(int c)545 comment_add (int c)
546 {
547 if (buflen >= bufmax)
548 {
549 bufmax = 2 * bufmax + 10;
550 buffer = xrealloc (buffer, bufmax);
551 }
552 buffer[buflen++] = c;
553 }
554
555 static inline void
comment_line_end(size_t chars_to_remove)556 comment_line_end (size_t chars_to_remove)
557 {
558 buflen -= chars_to_remove;
559 while (buflen >= 1
560 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
561 --buflen;
562 if (chars_to_remove == 0 && buflen >= bufmax)
563 {
564 bufmax = 2 * bufmax + 10;
565 buffer = xrealloc (buffer, bufmax);
566 }
567 buffer[buflen] = '\0';
568 savable_comment_add (buffer);
569 }
570
571
572 /* 3. Replace each comment that is not inside a string literal with a
573 space character. We need to remember the comment for later, because
574 it may be attached to a keyword string. */
575
576 /* These are for tracking whether comments count as immediately before
577 keyword. */
578 static int last_comment_line;
579 static int last_non_comment_line;
580
581 static unsigned char phase3_pushback[1];
582 static int phase3_pushback_length;
583
584 static int
phase3_getc()585 phase3_getc ()
586 {
587 int lineno;
588 int c;
589
590 if (phase3_pushback_length)
591 return phase3_pushback[--phase3_pushback_length];
592
593 c = phase1_getc ();
594
595 if (c == '#')
596 {
597 /* sh comment. */
598 bool last_was_qmark = false;
599
600 comment_start ();
601 lineno = line_number;
602 for (;;)
603 {
604 c = phase1_getc ();
605 if (c == '\n' || c == EOF)
606 {
607 comment_line_end (0);
608 break;
609 }
610 if (last_was_qmark && c == '>')
611 {
612 comment_line_end (1);
613 skip_html ();
614 break;
615 }
616 /* We skip all leading white space, but not EOLs. */
617 if (!(buflen == 0 && (c == ' ' || c == '\t')))
618 comment_add (c);
619 last_was_qmark = (c == '?' || c == '%');
620 }
621 last_comment_line = lineno;
622 return '\n';
623 }
624 else if (c == '/')
625 {
626 c = phase1_getc ();
627
628 switch (c)
629 {
630 default:
631 phase1_ungetc (c);
632 return '/';
633
634 case '*':
635 {
636 /* C comment. */
637 bool last_was_star;
638
639 comment_start ();
640 lineno = line_number;
641 last_was_star = false;
642 for (;;)
643 {
644 c = phase1_getc ();
645 if (c == EOF)
646 break;
647 /* We skip all leading white space, but not EOLs. */
648 if (buflen == 0 && (c == ' ' || c == '\t'))
649 continue;
650 comment_add (c);
651 switch (c)
652 {
653 case '\n':
654 comment_line_end (1);
655 comment_start ();
656 lineno = line_number;
657 last_was_star = false;
658 continue;
659
660 case '*':
661 last_was_star = true;
662 continue;
663
664 case '/':
665 if (last_was_star)
666 {
667 comment_line_end (2);
668 break;
669 }
670 /* FALLTHROUGH */
671
672 default:
673 last_was_star = false;
674 continue;
675 }
676 break;
677 }
678 last_comment_line = lineno;
679 return ' ';
680 }
681
682 case '/':
683 {
684 /* C++ comment. */
685 bool last_was_qmark = false;
686
687 comment_start ();
688 lineno = line_number;
689 for (;;)
690 {
691 c = phase1_getc ();
692 if (c == '\n' || c == EOF)
693 {
694 comment_line_end (0);
695 break;
696 }
697 if (last_was_qmark && c == '>')
698 {
699 comment_line_end (1);
700 skip_html ();
701 break;
702 }
703 /* We skip all leading white space, but not EOLs. */
704 if (!(buflen == 0 && (c == ' ' || c == '\t')))
705 comment_add (c);
706 last_was_qmark = (c == '?' || c == '%');
707 }
708 last_comment_line = lineno;
709 return '\n';
710 }
711 }
712 }
713 else
714 return c;
715 }
716
#ifdef unused
/* Push the character C back so that the next phase3_getc () returns it.
   EOF is silently ignored.  Supports 1 character of pushback; exceeding
   that aborts the program.  */
static void
phase3_ungetc (int c)
{
  if (c == EOF)
    return;

  if (phase3_pushback_length == SIZEOF (phase3_pushback))
    abort ();

  phase3_pushback[phase3_pushback_length] = c;
  phase3_pushback_length++;
}
#endif
729
730
731 /* ========================== Reading of tokens. ========================== */
732
733
/* The kinds of tokens produced by the tokenizer.  */
typedef enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_lbracket,          /* [ */
  token_type_rbracket,          /* ] */
  token_type_dot,               /* . */
  token_type_operator1,         /* * / % ++ -- */
  token_type_operator2,         /* + - ! ~ @ */
  token_type_string_literal,    /* "abc" */
  token_type_symbol,            /* symbol, number */
  token_type_other              /* misc. operator */
}
token_type_ty;
750
751 typedef struct token_ty token_ty;
752 struct token_ty
753 {
754 token_type_ty type;
755 char *string; /* for token_type_string_literal, token_type_symbol */
756 refcounted_string_list_ty *comment; /* for token_type_string_literal */
757 int line_number;
758 };
759
760
761 /* Free the memory pointed to by a 'struct token_ty'. */
762 static inline void
free_token(token_ty * tp)763 free_token (token_ty *tp)
764 {
765 if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
766 free (tp->string);
767 if (tp->type == token_type_string_literal)
768 drop_reference (tp->comment);
769 }
770
771
772 /* 4. Combine characters into tokens. Discard whitespace. */
773
774 static token_ty phase4_pushback[3];
775 static int phase4_pushback_length;
776
777 static void
phase4_get(token_ty * tp)778 phase4_get (token_ty *tp)
779 {
780 static char *buffer;
781 static int bufmax;
782 int bufpos;
783 int c;
784
785 if (phase4_pushback_length)
786 {
787 *tp = phase4_pushback[--phase4_pushback_length];
788 return;
789 }
790 tp->string = NULL;
791
792 for (;;)
793 {
794 tp->line_number = line_number;
795 c = phase3_getc ();
796 switch (c)
797 {
798 case EOF:
799 tp->type = token_type_eof;
800 return;
801
802 case '\n':
803 if (last_non_comment_line > last_comment_line)
804 savable_comment_reset ();
805 /* FALLTHROUGH */
806 case ' ':
807 case '\t':
808 case '\r':
809 /* Ignore whitespace. */
810 continue;
811 }
812
813 last_non_comment_line = tp->line_number;
814
815 switch (c)
816 {
817 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
818 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
819 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
820 case 'V': case 'W': case 'X': case 'Y': case 'Z':
821 case '_':
822 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
823 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
824 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
825 case 'v': case 'w': case 'x': case 'y': case 'z':
826 case 127: case 128: case 129: case 130: case 131: case 132: case 133:
827 case 134: case 135: case 136: case 137: case 138: case 139: case 140:
828 case 141: case 142: case 143: case 144: case 145: case 146: case 147:
829 case 148: case 149: case 150: case 151: case 152: case 153: case 154:
830 case 155: case 156: case 157: case 158: case 159: case 160: case 161:
831 case 162: case 163: case 164: case 165: case 166: case 167: case 168:
832 case 169: case 170: case 171: case 172: case 173: case 174: case 175:
833 case 176: case 177: case 178: case 179: case 180: case 181: case 182:
834 case 183: case 184: case 185: case 186: case 187: case 188: case 189:
835 case 190: case 191: case 192: case 193: case 194: case 195: case 196:
836 case 197: case 198: case 199: case 200: case 201: case 202: case 203:
837 case 204: case 205: case 206: case 207: case 208: case 209: case 210:
838 case 211: case 212: case 213: case 214: case 215: case 216: case 217:
839 case 218: case 219: case 220: case 221: case 222: case 223: case 224:
840 case 225: case 226: case 227: case 228: case 229: case 230: case 231:
841 case 232: case 233: case 234: case 235: case 236: case 237: case 238:
842 case 239: case 240: case 241: case 242: case 243: case 244: case 245:
843 case 246: case 247: case 248: case 249: case 250: case 251: case 252:
844 case 253: case 254: case 255:
845 bufpos = 0;
846 for (;;)
847 {
848 if (bufpos >= bufmax)
849 {
850 bufmax = 2 * bufmax + 10;
851 buffer = xrealloc (buffer, bufmax);
852 }
853 buffer[bufpos++] = c;
854 c = phase1_getc ();
855 switch (c)
856 {
857 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
858 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
859 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
860 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
861 case 'Y': case 'Z':
862 case '_':
863 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
864 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
865 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
866 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
867 case 'y': case 'z':
868 case '0': case '1': case '2': case '3': case '4':
869 case '5': case '6': case '7': case '8': case '9':
870 case 127: case 128: case 129: case 130: case 131: case 132:
871 case 133: case 134: case 135: case 136: case 137: case 138:
872 case 139: case 140: case 141: case 142: case 143: case 144:
873 case 145: case 146: case 147: case 148: case 149: case 150:
874 case 151: case 152: case 153: case 154: case 155: case 156:
875 case 157: case 158: case 159: case 160: case 161: case 162:
876 case 163: case 164: case 165: case 166: case 167: case 168:
877 case 169: case 170: case 171: case 172: case 173: case 174:
878 case 175: case 176: case 177: case 178: case 179: case 180:
879 case 181: case 182: case 183: case 184: case 185: case 186:
880 case 187: case 188: case 189: case 190: case 191: case 192:
881 case 193: case 194: case 195: case 196: case 197: case 198:
882 case 199: case 200: case 201: case 202: case 203: case 204:
883 case 205: case 206: case 207: case 208: case 209: case 210:
884 case 211: case 212: case 213: case 214: case 215: case 216:
885 case 217: case 218: case 219: case 220: case 221: case 222:
886 case 223: case 224: case 225: case 226: case 227: case 228:
887 case 229: case 230: case 231: case 232: case 233: case 234:
888 case 235: case 236: case 237: case 238: case 239: case 240:
889 case 241: case 242: case 243: case 244: case 245: case 246:
890 case 247: case 248: case 249: case 250: case 251: case 252:
891 case 253: case 254: case 255:
892 continue;
893
894 default:
895 phase1_ungetc (c);
896 break;
897 }
898 break;
899 }
900 if (bufpos >= bufmax)
901 {
902 bufmax = 2 * bufmax + 10;
903 buffer = xrealloc (buffer, bufmax);
904 }
905 buffer[bufpos] = 0;
906 tp->string = xstrdup (buffer);
907 tp->type = token_type_symbol;
908 return;
909
910 case '\'':
911 /* Single-quoted string literal. */
912 bufpos = 0;
913 for (;;)
914 {
915 c = phase1_getc ();
916 if (c == EOF || c == '\'')
917 break;
918 if (c == '\\')
919 {
920 c = phase1_getc ();
921 if (c != '\\' && c != '\'')
922 {
923 phase1_ungetc (c);
924 c = '\\';
925 }
926 }
927 if (bufpos >= bufmax)
928 {
929 bufmax = 2 * bufmax + 10;
930 buffer = xrealloc (buffer, bufmax);
931 }
932 buffer[bufpos++] = c;
933 }
934 if (bufpos >= bufmax)
935 {
936 bufmax = 2 * bufmax + 10;
937 buffer = xrealloc (buffer, bufmax);
938 }
939 buffer[bufpos] = 0;
940 tp->type = token_type_string_literal;
941 tp->string = xstrdup (buffer);
942 tp->comment = add_reference (savable_comment);
943 return;
944
945 case '"':
946 /* Double-quoted string literal. */
947 tp->type = token_type_string_literal;
948 bufpos = 0;
949 for (;;)
950 {
951 c = phase1_getc ();
952 if (c == EOF || c == '"')
953 break;
954 if (c == '$')
955 {
956 c = phase1_getc ();
957 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
958 || c == '_' || c == '{' || c >= 0x7f)
959 {
960 /* String with variables. */
961 tp->type = token_type_other;
962 continue;
963 }
964 phase1_ungetc (c);
965 c = '$';
966 }
967 if (c == '{')
968 {
969 c = phase1_getc ();
970 if (c == '$')
971 {
972 /* String with expressions. */
973 tp->type = token_type_other;
974 continue;
975 }
976 phase1_ungetc (c);
977 c = '{';
978 }
979 if (c == '\\')
980 {
981 int n, j;
982
983 c = phase1_getc ();
984 switch (c)
985 {
986 case '"':
987 case '\\':
988 case '$':
989 break;
990
991 case '0': case '1': case '2': case '3':
992 case '4': case '5': case '6': case '7':
993 n = 0;
994 for (j = 0; j < 3; ++j)
995 {
996 n = n * 8 + c - '0';
997 c = phase1_getc ();
998 switch (c)
999 {
1000 default:
1001 break;
1002
1003 case '0': case '1': case '2': case '3':
1004 case '4': case '5': case '6': case '7':
1005 continue;
1006 }
1007 break;
1008 }
1009 phase1_ungetc (c);
1010 c = n;
1011 break;
1012
1013 case 'x':
1014 n = 0;
1015 for (j = 0; j < 2; ++j)
1016 {
1017 c = phase1_getc ();
1018 switch (c)
1019 {
1020 case '0': case '1': case '2': case '3': case '4':
1021 case '5': case '6': case '7': case '8': case '9':
1022 n = n * 16 + c - '0';
1023 break;
1024 case 'A': case 'B': case 'C': case 'D': case 'E':
1025 case 'F':
1026 n = n * 16 + 10 + c - 'A';
1027 break;
1028 case 'a': case 'b': case 'c': case 'd': case 'e':
1029 case 'f':
1030 n = n * 16 + 10 + c - 'a';
1031 break;
1032 default:
1033 phase1_ungetc (c);
1034 c = 0;
1035 break;
1036 }
1037 if (c == 0)
1038 break;
1039 }
1040 if (j == 0)
1041 {
1042 phase1_ungetc ('x');
1043 c = '\\';
1044 }
1045 else
1046 c = n;
1047 break;
1048
1049 case 'n':
1050 c = '\n';
1051 break;
1052 case 't':
1053 c = '\t';
1054 break;
1055 case 'r':
1056 c = '\r';
1057 break;
1058
1059 default:
1060 phase1_ungetc (c);
1061 c = '\\';
1062 break;
1063 }
1064 }
1065 if (bufpos >= bufmax)
1066 {
1067 bufmax = 2 * bufmax + 10;
1068 buffer = xrealloc (buffer, bufmax);
1069 }
1070 buffer[bufpos++] = c;
1071 }
1072 if (bufpos >= bufmax)
1073 {
1074 bufmax = 2 * bufmax + 10;
1075 buffer = xrealloc (buffer, bufmax);
1076 }
1077 buffer[bufpos] = 0;
1078 if (tp->type == token_type_string_literal)
1079 {
1080 tp->string = xstrdup (buffer);
1081 tp->comment = add_reference (savable_comment);
1082 }
1083 return;
1084
1085 case '?':
1086 case '%':
1087 {
1088 int c2 = phase1_getc ();
1089 if (c2 == '>')
1090 {
1091 /* ?> and %> terminate PHP mode and switch back to HTML
1092 mode. */
1093 skip_html ();
1094 tp->type = token_type_other;
1095 }
1096 else
1097 {
1098 phase1_ungetc (c2);
1099 tp->type = (c == '%' ? token_type_operator1 : token_type_other);
1100 }
1101 return;
1102 }
1103
1104 case '(':
1105 tp->type = token_type_lparen;
1106 return;
1107
1108 case ')':
1109 tp->type = token_type_rparen;
1110 return;
1111
1112 case ',':
1113 tp->type = token_type_comma;
1114 return;
1115
1116 case '[':
1117 tp->type = token_type_lbracket;
1118 return;
1119
1120 case ']':
1121 tp->type = token_type_rbracket;
1122 return;
1123
1124 case '.':
1125 tp->type = token_type_dot;
1126 return;
1127
1128 case '*':
1129 case '/':
1130 tp->type = token_type_operator1;
1131 return;
1132
1133 case '+':
1134 case '-':
1135 {
1136 int c2 = phase1_getc ();
1137 if (c2 == c)
1138 /* ++ or -- */
1139 tp->type = token_type_operator1;
1140 else
1141 /* + or - */
1142 {
1143 phase1_ungetc (c2);
1144 tp->type = token_type_operator2;
1145 }
1146 return;
1147 }
1148
1149 case '!':
1150 case '~':
1151 case '@':
1152 tp->type = token_type_operator2;
1153 return;
1154
1155 case '<':
1156 {
1157 int c2 = phase1_getc ();
1158 if (c2 == '<')
1159 {
1160 int c3 = phase1_getc ();
1161 if (c3 == '<')
1162 {
1163 int label_start = 0;
1164
1165 /* Start of here and now document.
1166 Parse whitespace, then label, then newline. */
1167 do
1168 c = phase3_getc ();
1169 while (c == ' ' || c == '\t' || c == '\n' || c == '\r');
1170
1171 bufpos = 0;
1172 do
1173 {
1174 if (bufpos >= bufmax)
1175 {
1176 bufmax = 2 * bufmax + 10;
1177 buffer = xrealloc (buffer, bufmax);
1178 }
1179 buffer[bufpos++] = c;
1180 c = phase3_getc ();
1181 }
1182 while (c != EOF && c != '\n' && c != '\r');
1183 /* buffer[0..bufpos-1] now contains the label
1184 (including single or double quotes). */
1185
1186 if (*buffer == '\'' || *buffer == '"')
1187 {
1188 label_start++;
1189 bufpos--;
1190 }
1191
1192 /* Now skip the here document. */
1193 for (;;)
1194 {
1195 c = phase1_getc ();
1196 if (c == EOF)
1197 break;
1198 if (c == '\n' || c == '\r')
1199 {
1200 int bufidx = label_start;
1201
1202 while (bufidx < bufpos)
1203 {
1204 c = phase1_getc ();
1205 if (c == EOF)
1206 break;
1207 if (c != buffer[bufidx])
1208 {
1209 phase1_ungetc (c);
1210 break;
1211 }
1212 bufidx++;
1213 }
1214 if (bufidx == bufpos)
1215 {
1216 c = phase1_getc ();
1217 if (c != ';')
1218 phase1_ungetc (c);
1219 c = phase1_getc ();
1220 if (c == '\n' || c == '\r')
1221 break;
1222 }
1223 }
1224 }
1225
1226 /* FIXME: Ideally we should turn the here document into a
1227 string literal if it didn't contain $ substitution. And
1228 we should also respect backslash escape sequences like
1229 in double-quoted strings. */
1230 tp->type = token_type_other;
1231 return;
1232 }
1233 phase1_ungetc (c3);
1234 }
1235
1236 /* < / script > terminates PHP mode and switches back to HTML
1237 mode. */
1238 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
1239 c2 = phase1_getc ();
1240 if (c2 == '/')
1241 {
1242 do
1243 c2 = phase1_getc ();
1244 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
1245 if (c2 == 's' || c2 == 'S')
1246 {
1247 c2 = phase1_getc ();
1248 if (c2 == 'c' || c2 == 'C')
1249 {
1250 c2 = phase1_getc ();
1251 if (c2 == 'r' || c2 == 'R')
1252 {
1253 c2 = phase1_getc ();
1254 if (c2 == 'i' || c2 == 'I')
1255 {
1256 c2 = phase1_getc ();
1257 if (c2 == 'p' || c2 == 'P')
1258 {
1259 c2 = phase1_getc ();
1260 if (c2 == 't' || c2 == 'T')
1261 {
1262 do
1263 c2 = phase1_getc ();
1264 while (c2 == ' ' || c2 == '\t'
1265 || c2 == '\n' || c2 == '\r');
1266 if (c2 == '>')
1267 {
1268 skip_html ();
1269 }
1270 else
1271 phase1_ungetc (c2);
1272 }
1273 else
1274 phase1_ungetc (c2);
1275 }
1276 else
1277 phase1_ungetc (c2);
1278 }
1279 else
1280 phase1_ungetc (c2);
1281 }
1282 else
1283 phase1_ungetc (c2);
1284 }
1285 else
1286 phase1_ungetc (c2);
1287 }
1288 else
1289 phase1_ungetc (c2);
1290 }
1291 else
1292 phase1_ungetc (c2);
1293
1294 tp->type = token_type_other;
1295 return;
1296 }
1297
1298 case '`':
1299 /* Execution operator. */
1300 default:
1301 /* We could carefully recognize each of the 2 and 3 character
1302 operators, but it is not necessary, as we only need to recognize
1303 gettext invocations. Don't bother. */
1304 tp->type = token_type_other;
1305 return;
1306 }
1307 }
1308 }
1309
1310 /* Supports 3 tokens of pushback. */
1311 static void
phase4_unget(token_ty * tp)1312 phase4_unget (token_ty *tp)
1313 {
1314 if (tp->type != token_type_eof)
1315 {
1316 if (phase4_pushback_length == SIZEOF (phase4_pushback))
1317 abort ();
1318 phase4_pushback[phase4_pushback_length++] = *tp;
1319 }
1320 }
1321
1322
/* 5. Compile-time optimization of string literal concatenation.
   Combine "string1" . ... . "stringN" to the concatenated string if
     - the token before this expression is none of
       '+' '-' '.' '*' '/' '%' '!' '~' '++' '--' ')' '@'
       (because then the first string could be part of an expression with
       the same or higher precedence as '.', such as an additive,
       multiplicative, negation, preincrement, or cast expression),
     - the token after this expression is none of
       '*' '/' '%' '++' '--'
       (because then the last string could be part of an expression with
       higher precedence than '.', such as a multiplicative or postincrement
       expression).  */
1335
1336 static token_type_ty phase5_last;
1337
1338 static void
x_php_lex(token_ty * tp)1339 x_php_lex (token_ty *tp)
1340 {
1341 phase4_get (tp);
1342 if (tp->type == token_type_string_literal
1343 && !(phase5_last == token_type_dot
1344 || phase5_last == token_type_operator1
1345 || phase5_last == token_type_operator2
1346 || phase5_last == token_type_rparen))
1347 {
1348 char *sum = tp->string;
1349 size_t sum_len = strlen (sum);
1350
1351 for (;;)
1352 {
1353 token_ty token2;
1354
1355 phase4_get (&token2);
1356 if (token2.type == token_type_dot)
1357 {
1358 token_ty token3;
1359
1360 phase4_get (&token3);
1361 if (token3.type == token_type_string_literal)
1362 {
1363 token_ty token_after;
1364
1365 phase4_get (&token_after);
1366 if (token_after.type != token_type_operator1)
1367 {
1368 char *addend = token3.string;
1369 size_t addend_len = strlen (addend);
1370
1371 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1372 memcpy (sum + sum_len, addend, addend_len + 1);
1373 sum_len += addend_len;
1374
1375 phase4_unget (&token_after);
1376 free_token (&token3);
1377 free_token (&token2);
1378 continue;
1379 }
1380 phase4_unget (&token_after);
1381 }
1382 phase4_unget (&token3);
1383 }
1384 phase4_unget (&token2);
1385 break;
1386 }
1387 tp->string = sum;
1388 }
1389 phase5_last = tp->type;
1390 }
1391
1392
1393 /* ========================= Extracting strings. ========================== */
1394
1395
/* Context lookup table.  Set by extract_php from its FLAG_TABLE argument;
   extract_balanced looks up the name of each symbol token in it to obtain
   the context iterator that applies if the next token is a '('.  */
static flag_context_list_table_ty *flag_context_list_table;
1398
1399
/* The file is broken into tokens. Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string. When we
   see this sequence, we have something to remember. We assume we are
   looking at a valid PHP program, and leave the complaints about
   the grammar to the PHP interpreter.
1405
1406 Normal handling: Look for
1407 keyword ( ... msgid ... )
1408 Plural handling: Look for
1409 keyword ( ... msgid ... msgid_plural ... )
1410
1411 We use recursion because the arguments before msgid or between msgid
1412 and msgid_plural can contain subexpressions of the same form. */
1413
1414
1415 /* Extract messages until the next balanced closing parenthesis or bracket.
1416 Extracted messages are added to MLP.
1417 DELIM can be either token_type_rparen or token_type_rbracket, or
1418 token_type_eof to accept both.
1419 Return true upon eof, false upon closing parenthesis or bracket. */
1420 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1421 extract_balanced (message_list_ty *mlp,
1422 token_type_ty delim,
1423 flag_context_ty outer_context,
1424 flag_context_list_iterator_ty context_iter,
1425 struct arglist_parser *argparser)
1426 {
1427 /* Current argument number. */
1428 int arg = 1;
1429 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1430 int state;
1431 /* Parameters of the keyword just seen. Defined only in state 1. */
1432 const struct callshapes *next_shapes = NULL;
1433 /* Context iterator that will be used if the next token is a '('. */
1434 flag_context_list_iterator_ty next_context_iter =
1435 passthrough_context_list_iterator;
1436 /* Current context. */
1437 flag_context_ty inner_context =
1438 inherited_context (outer_context,
1439 flag_context_list_iterator_advance (&context_iter));
1440
1441 /* Start state is 0. */
1442 state = 0;
1443
1444 for (;;)
1445 {
1446 token_ty token;
1447
1448 x_php_lex (&token);
1449 switch (token.type)
1450 {
1451 case token_type_symbol:
1452 {
1453 void *keyword_value;
1454
1455 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1456 &keyword_value)
1457 == 0)
1458 {
1459 next_shapes = (const struct callshapes *) keyword_value;
1460 state = 1;
1461 }
1462 else
1463 state = 0;
1464 }
1465 next_context_iter =
1466 flag_context_list_iterator (
1467 flag_context_list_table_lookup (
1468 flag_context_list_table,
1469 token.string, strlen (token.string)));
1470 free (token.string);
1471 continue;
1472
1473 case token_type_lparen:
1474 if (extract_balanced (mlp, token_type_rparen,
1475 inner_context, next_context_iter,
1476 arglist_parser_alloc (mlp,
1477 state ? next_shapes : NULL)))
1478 {
1479 arglist_parser_done (argparser, arg);
1480 return true;
1481 }
1482 next_context_iter = null_context_list_iterator;
1483 state = 0;
1484 continue;
1485
1486 case token_type_rparen:
1487 if (delim == token_type_rparen || delim == token_type_eof)
1488 {
1489 arglist_parser_done (argparser, arg);
1490 return false;
1491 }
1492 next_context_iter = null_context_list_iterator;
1493 state = 0;
1494 continue;
1495
1496 case token_type_comma:
1497 arg++;
1498 inner_context =
1499 inherited_context (outer_context,
1500 flag_context_list_iterator_advance (
1501 &context_iter));
1502 next_context_iter = passthrough_context_list_iterator;
1503 state = 0;
1504 continue;
1505
1506 case token_type_lbracket:
1507 if (extract_balanced (mlp, token_type_rbracket,
1508 null_context, null_context_list_iterator,
1509 arglist_parser_alloc (mlp, NULL)))
1510 {
1511 arglist_parser_done (argparser, arg);
1512 return true;
1513 }
1514 next_context_iter = null_context_list_iterator;
1515 state = 0;
1516 continue;
1517
1518 case token_type_rbracket:
1519 if (delim == token_type_rbracket || delim == token_type_eof)
1520 {
1521 arglist_parser_done (argparser, arg);
1522 return false;
1523 }
1524 next_context_iter = null_context_list_iterator;
1525 state = 0;
1526 continue;
1527
1528 case token_type_string_literal:
1529 {
1530 lex_pos_ty pos;
1531 pos.file_name = logical_file_name;
1532 pos.line_number = token.line_number;
1533
1534 if (extract_all)
1535 remember_a_message (mlp, NULL, token.string, false, false,
1536 inner_context, &pos,
1537 NULL, token.comment, false);
1538 else
1539 {
1540 mixed_string_ty *ms =
1541 mixed_string_alloc_simple (token.string, lc_string,
1542 pos.file_name, pos.line_number);
1543 free (token.string);
1544 arglist_parser_remember (argparser, arg, ms, inner_context,
1545 pos.file_name, pos.line_number,
1546 token.comment, false);
1547 }
1548 drop_reference (token.comment);
1549 }
1550 next_context_iter = null_context_list_iterator;
1551 state = 0;
1552 continue;
1553
1554 case token_type_dot:
1555 case token_type_operator1:
1556 case token_type_operator2:
1557 case token_type_other:
1558 next_context_iter = null_context_list_iterator;
1559 state = 0;
1560 continue;
1561
1562 case token_type_eof:
1563 arglist_parser_done (argparser, arg);
1564 return true;
1565
1566 default:
1567 abort ();
1568 }
1569 }
1570 }
1571
1572
1573 void
extract_php(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1574 extract_php (FILE *f,
1575 const char *real_filename, const char *logical_filename,
1576 flag_context_list_table_ty *flag_table,
1577 msgdomain_list_ty *mdlp)
1578 {
1579 message_list_ty *mlp = mdlp->item[0]->messages;
1580
1581 fp = f;
1582 real_file_name = real_filename;
1583 logical_file_name = xstrdup (logical_filename);
1584 line_number = 1;
1585
1586 phase1_pushback_length = 0;
1587 #if 0
1588 phase2_pushback_length = 0;
1589 #endif
1590
1591 last_comment_line = -1;
1592 last_non_comment_line = -1;
1593
1594 phase3_pushback_length = 0;
1595 phase4_pushback_length = 0;
1596
1597 phase5_last = token_type_eof;
1598
1599 flag_context_list_table = flag_table;
1600
1601 init_keywords ();
1602
1603 /* Initial mode is HTML mode, not PHP mode. */
1604 skip_html ();
1605
1606 /* Eat tokens until eof is seen. When extract_balanced returns
1607 due to an unbalanced closing parenthesis, just restart it. */
1608 while (!extract_balanced (mlp, token_type_eof,
1609 null_context, null_context_list_iterator,
1610 arglist_parser_alloc (mlp, NULL)))
1611 ;
1612
1613 /* Close scanner. */
1614 fp = NULL;
1615 real_file_name = NULL;
1616 logical_file_name = NULL;
1617 line_number = 0;
1618 }
1619