1 /* xgettext C# backend.
2 Copyright (C) 2003-2009, 2011, 2014, 2018-2020 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #ifdef HAVE_CONFIG_H
19 # include "config.h"
20 #endif
21
22 /* Specification. */
23 #include "x-csharp.h"
24
25 #include <errno.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30
31 #include "message.h"
32 #include "rc-str-list.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-encoding.h"
36 #include "xg-mixed-string.h"
37 #include "xg-arglist-context.h"
38 #include "xg-arglist-callshape.h"
39 #include "xg-arglist-parser.h"
40 #include "xg-message.h"
41 #include "c-ctype.h"
42 #include "error.h"
43 #include "error-progname.h"
44 #include "xalloc.h"
45 #include "xerror.h"
46 #include "xvasprintf.h"
47 #include "mem-hash-map.h"
48 #include "po-charset.h"
49 #include "unistr.h"
50 #include "gettext.h"
51
52 #define _(s) gettext(s)
53
54 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
55
56
57 /* The C# syntax is defined in ECMA-334, second edition. */
58
59
60 /* ====================== Keyword set customization. ====================== */
61
62 /* If true extract all strings. */
63 static bool extract_all = false;
64
65 static hash_table keywords;
66 static bool default_keywords = true;
67
68
69 void
x_csharp_extract_all()70 x_csharp_extract_all ()
71 {
72 extract_all = true;
73 }
74
75
76 /* Processes a --keyword option.
77 Non-ASCII function names can be used if given in UTF-8 encoding. */
78 void
x_csharp_keyword(const char * name)79 x_csharp_keyword (const char *name)
80 {
81 if (name == NULL)
82 default_keywords = false;
83 else
84 {
85 const char *end;
86 struct callshape shape;
87 const char *colon;
88
89 if (keywords.table == NULL)
90 hash_init (&keywords, 100);
91
92 split_keywordspec (name, &end, &shape);
93
94 /* The characters between name and end should form a valid C#
95 identifier sequence with dots.
96 A colon means an invalid parse in split_keywordspec(). */
97 colon = strchr (name, ':');
98 if (colon == NULL || colon >= end)
99 insert_keyword_callshape (&keywords, name, end - name, &shape);
100 }
101 }
102
103 /* Finish initializing the keywords hash table.
104 Called after argument processing, before each file is processed. */
105 static void
init_keywords()106 init_keywords ()
107 {
108 if (default_keywords)
109 {
110 /* When adding new keywords here, also update the documentation in
111 xgettext.texi! */
112 x_csharp_keyword ("GetString"); /* Resource{Manager,Set}.GetString */
113 x_csharp_keyword ("GetPluralString:1,2"); /* GettextResource{Manager,Set}.GetPluralString */
114 x_csharp_keyword ("GetParticularString:1c,2"); /* Resource{Manager,Set}.GetParticularString */
115 x_csharp_keyword ("GetParticularPluralString:1c,2,3"); /* Resource{Manager,Set}.GetParticularPluralString */
116 default_keywords = false;
117 }
118 }
119
/* Record the format-string flags of the well-known C# functions, one
   xgettext_record_flag call per "function:argument:flag" entry.  */
void
init_flag_table_csharp ()
{
  static const char *const builtin_flags[] =
    {
      "GetString:1:pass-csharp-format",
      "GetPluralString:1:pass-csharp-format",
      "GetPluralString:2:pass-csharp-format",
      "GetParticularString:2:pass-csharp-format",
      "GetParticularPluralString:2:pass-csharp-format",
      "GetParticularPluralString:3:pass-csharp-format",
      "String.Format:1:csharp-format"
    };
  size_t i;

  for (i = 0; i < sizeof builtin_flags / sizeof builtin_flags[0]; i++)
    xgettext_record_flag (builtin_flags[i]);
}
131
132
133 /* ======================== Reading of characters. ======================== */
134
135 /* The input file stream. */
136 static FILE *fp;
137
138
139 /* Phase 1: line_number handling. */
140
141 /* Maximum used, roughly a safer MB_LEN_MAX. */
142 #define MAX_PHASE1_PUSHBACK 16
143 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
144 static int phase1_pushback_length;
145
146 /* Read the next single byte from the input file. */
147 static int
phase1_getc()148 phase1_getc ()
149 {
150 int c;
151
152 if (phase1_pushback_length)
153 {
154 c = phase1_pushback[--phase1_pushback_length];
155 if (c == '\n')
156 ++line_number;
157 return c;
158 }
159
160 c = getc (fp);
161 if (c == EOF)
162 {
163 if (ferror (fp))
164 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
165 real_file_name);
166 return EOF;
167 }
168
169 if (c == '\n')
170 ++line_number;
171 return c;
172 }
173
174 /* Supports MAX_PHASE1_PUSHBACK characters of pushback. */
175 static void
phase1_ungetc(int c)176 phase1_ungetc (int c)
177 {
178 if (c != EOF)
179 {
180 if (c == '\n')
181 --line_number;
182 if (phase1_pushback_length == SIZEOF (phase1_pushback))
183 abort ();
184 phase1_pushback[phase1_pushback_length++] = c;
185 }
186 }
187
188
189 /* Phase 2: Conversion to Unicode.
190 This is done early because ECMA-334 section 9.1. says that the source is
191 "an ordered sequence of Unicode characters", and because the recognition
192 of the line terminators (ECMA-334 section 9.3.1) is hardly possible without
193 prior conversion to Unicode. */
194
195 /* End-of-file indicator for functions returning an UCS-4 character. */
196 #define UEOF -1
197
198 /* Newline Unicode character. */
199 #define UNL 0x000a
200
201 static lexical_context_ty lexical_context;
202
203 static int phase2_pushback[1];
204 static int phase2_pushback_length;
205
206 /* Read the next Unicode UCS-4 character from the input file. */
207 static int
phase2_getc()208 phase2_getc ()
209 {
210 if (phase2_pushback_length)
211 return phase2_pushback[--phase2_pushback_length];
212
213 if (xgettext_current_source_encoding == po_charset_ascii)
214 {
215 int c = phase1_getc ();
216 if (c == EOF)
217 return UEOF;
218 if (!c_isascii (c))
219 {
220 multiline_error (xstrdup (""),
221 xasprintf ("%s\n%s\n",
222 non_ascii_error_message (lexical_context,
223 real_file_name,
224 line_number),
225 _("Please specify the source encoding through --from-code.")));
226 exit (EXIT_FAILURE);
227 }
228 return c;
229 }
230 else if (xgettext_current_source_encoding != po_charset_utf8)
231 {
232 #if HAVE_ICONV
233 /* Use iconv on an increasing number of bytes. Read only as many bytes
234 through phase1_getc as needed. This is needed to give reasonable
235 interactive behaviour when fp is connected to an interactive tty. */
236 unsigned char buf[MAX_PHASE1_PUSHBACK];
237 size_t bufcount;
238 int c = phase1_getc ();
239 if (c == EOF)
240 return UEOF;
241 buf[0] = (unsigned char) c;
242 bufcount = 1;
243
244 for (;;)
245 {
246 unsigned char scratchbuf[6];
247 const char *inptr = (const char *) &buf[0];
248 size_t insize = bufcount;
249 char *outptr = (char *) &scratchbuf[0];
250 size_t outsize = sizeof (scratchbuf);
251
252 size_t res = iconv (xgettext_current_source_iconv,
253 (ICONV_CONST char **) &inptr, &insize,
254 &outptr, &outsize);
255 /* We expect that a character has been produced if and only if
256 some input bytes have been consumed. */
257 if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
258 abort ();
259 if (outsize == sizeof (scratchbuf))
260 {
261 /* No character has been produced. Must be an error. */
262 if (res != (size_t)(-1))
263 abort ();
264
265 if (errno == EILSEQ)
266 {
267 /* An invalid multibyte sequence was encountered. */
268 multiline_error (xstrdup (""),
269 xasprintf (_("\
270 %s:%d: Invalid multibyte sequence.\n\
271 Please specify the correct source encoding through --from-code.\n"),
272 real_file_name, line_number));
273 exit (EXIT_FAILURE);
274 }
275 else if (errno == EINVAL)
276 {
277 /* An incomplete multibyte character. */
278 int c;
279
280 if (bufcount == MAX_PHASE1_PUSHBACK)
281 {
282 /* An overlong incomplete multibyte sequence was
283 encountered. */
284 multiline_error (xstrdup (""),
285 xasprintf (_("\
286 %s:%d: Long incomplete multibyte sequence.\n\
287 Please specify the correct source encoding through --from-code.\n"),
288 real_file_name, line_number));
289 exit (EXIT_FAILURE);
290 }
291
292 /* Read one more byte and retry iconv. */
293 c = phase1_getc ();
294 if (c == EOF)
295 {
296 multiline_error (xstrdup (""),
297 xasprintf (_("\
298 %s:%d: Incomplete multibyte sequence at end of file.\n\
299 Please specify the correct source encoding through --from-code.\n"),
300 real_file_name, line_number));
301 exit (EXIT_FAILURE);
302 }
303 if (c == '\n')
304 {
305 multiline_error (xstrdup (""),
306 xasprintf (_("\
307 %s:%d: Incomplete multibyte sequence at end of line.\n\
308 Please specify the correct source encoding through --from-code.\n"),
309 real_file_name, line_number - 1));
310 exit (EXIT_FAILURE);
311 }
312 buf[bufcount++] = (unsigned char) c;
313 }
314 else
315 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
316 real_file_name, line_number);
317 }
318 else
319 {
320 size_t outbytes = sizeof (scratchbuf) - outsize;
321 size_t bytes = bufcount - insize;
322 ucs4_t uc;
323
324 /* We expect that one character has been produced. */
325 if (bytes == 0)
326 abort ();
327 if (outbytes == 0)
328 abort ();
329 /* Push back the unused bytes. */
330 while (insize > 0)
331 phase1_ungetc (buf[--insize]);
332 /* Convert the character from UTF-8 to UCS-4. */
333 if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
334 {
335 /* scratchbuf contains an out-of-range Unicode character
336 (> 0x10ffff). */
337 multiline_error (xstrdup (""),
338 xasprintf (_("\
339 %s:%d: Invalid multibyte sequence.\n\
340 Please specify the source encoding through --from-code.\n"),
341 real_file_name, line_number));
342 exit (EXIT_FAILURE);
343 }
344 return uc;
345 }
346 }
347 #else
348 /* If we don't have iconv(), the only supported values for
349 xgettext_global_source_encoding and thus also for
350 xgettext_current_source_encoding are ASCII and UTF-8. */
351 abort ();
352 #endif
353 }
354 else
355 {
356 /* Read an UTF-8 encoded character. */
357 unsigned char buf[6];
358 unsigned int count;
359 int c;
360 ucs4_t uc;
361
362 c = phase1_getc ();
363 if (c == EOF)
364 return UEOF;
365 buf[0] = c;
366 count = 1;
367
368 if (buf[0] >= 0xc0)
369 {
370 c = phase1_getc ();
371 if (c == EOF)
372 return UEOF;
373 buf[1] = c;
374 count = 2;
375 }
376
377 if (buf[0] >= 0xe0
378 && ((buf[1] ^ 0x80) < 0x40))
379 {
380 c = phase1_getc ();
381 if (c == EOF)
382 return UEOF;
383 buf[2] = c;
384 count = 3;
385 }
386
387 if (buf[0] >= 0xf0
388 && ((buf[1] ^ 0x80) < 0x40)
389 && ((buf[2] ^ 0x80) < 0x40))
390 {
391 c = phase1_getc ();
392 if (c == EOF)
393 return UEOF;
394 buf[3] = c;
395 count = 4;
396 }
397
398 if (buf[0] >= 0xf8
399 && ((buf[1] ^ 0x80) < 0x40)
400 && ((buf[2] ^ 0x80) < 0x40)
401 && ((buf[3] ^ 0x80) < 0x40))
402 {
403 c = phase1_getc ();
404 if (c == EOF)
405 return UEOF;
406 buf[4] = c;
407 count = 5;
408 }
409
410 if (buf[0] >= 0xfc
411 && ((buf[1] ^ 0x80) < 0x40)
412 && ((buf[2] ^ 0x80) < 0x40)
413 && ((buf[3] ^ 0x80) < 0x40)
414 && ((buf[4] ^ 0x80) < 0x40))
415 {
416 c = phase1_getc ();
417 if (c == EOF)
418 return UEOF;
419 buf[5] = c;
420 count = 6;
421 }
422
423 u8_mbtouc (&uc, buf, count);
424 return uc;
425 }
426 }
427
428 /* Supports only one pushback character. */
429 static void
phase2_ungetc(int c)430 phase2_ungetc (int c)
431 {
432 if (c != UEOF)
433 {
434 if (phase2_pushback_length == SIZEOF (phase2_pushback))
435 abort ();
436 phase2_pushback[phase2_pushback_length++] = c;
437 }
438 }
439
440
441 /* Phase 3: Convert all line terminators to LF.
442 See ECMA-334 section 9.3.1. */
443
444 /* Line number defined in terms of phase3. */
445 static int logical_line_number;
446
447 static int phase3_pushback[9];
448 static int phase3_pushback_length;
449
450 /* Read the next Unicode UCS-4 character from the input file, mapping
451 all line terminators to U+000A, and dropping U+001A at the end of file. */
452 static int
phase3_getc()453 phase3_getc ()
454 {
455 int c;
456
457 if (phase3_pushback_length)
458 {
459 c = phase3_pushback[--phase3_pushback_length];
460 if (c == UNL)
461 ++logical_line_number;
462 return c;
463 }
464
465 c = phase2_getc ();
466
467 if (c == 0x000d)
468 {
469 int c1 = phase2_getc ();
470
471 if (c1 != UEOF && c1 != 0x000a)
472 phase2_ungetc (c1);
473
474 /* Seen line terminator CR or CR/LF. */
475 ++logical_line_number;
476 return UNL;
477 }
478
479 if (c == 0x0085 || c == 0x2028 || c == 0x2029)
480 {
481 /* Seen Unicode word processor newline. */
482 ++logical_line_number;
483 return UNL;
484 }
485
486 if (c == 0x001a)
487 {
488 int c1 = phase2_getc ();
489
490 if (c1 == UEOF)
491 /* Seen U+001A right before the end of file. */
492 return UEOF;
493
494 phase2_ungetc (c1);
495 }
496
497 if (c == UNL)
498 ++logical_line_number;
499 return c;
500 }
501
502 /* Supports 9 characters of pushback. */
503 static void
phase3_ungetc(int c)504 phase3_ungetc (int c)
505 {
506 if (c != UEOF)
507 {
508 if (c == UNL)
509 --logical_line_number;
510 if (phase3_pushback_length == SIZEOF (phase3_pushback))
511 abort ();
512 phase3_pushback[phase3_pushback_length++] = c;
513 }
514 }
515
516
517 /* ========================= Accumulating strings. ======================== */
518
519 /* See xg-mixed-string.h for the API.
520 In this extractor, we add only Unicode characters. */
521
522
523 /* ======================== Accumulating comments. ======================== */
524
525
526 /* Accumulating a single comment line. */
527
528 static struct mixed_string_buffer comment_buffer;
529
530 static inline void
comment_start()531 comment_start ()
532 {
533 mixed_string_buffer_init (&comment_buffer, lc_comment,
534 logical_file_name, line_number);
535 }
536
537 static inline bool
comment_at_start()538 comment_at_start ()
539 {
540 return mixed_string_buffer_is_empty (&comment_buffer);
541 }
542
543 static inline void
comment_add(int c)544 comment_add (int c)
545 {
546 mixed_string_buffer_append_unicode (&comment_buffer, c);
547 }
548
549 static inline void
comment_line_end(size_t chars_to_remove)550 comment_line_end (size_t chars_to_remove)
551 {
552 char *buffer =
553 mixed_string_contents_free1 (mixed_string_buffer_result (&comment_buffer));
554 size_t buflen = strlen (buffer);
555
556 buflen -= chars_to_remove;
557 while (buflen >= 1
558 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
559 --buflen;
560 buffer[buflen] = '\0';
561 savable_comment_add (buffer);
562 lexical_context = lc_outside;
563 }
564
565
566 /* These are for tracking whether comments count as immediately before
567 keyword. */
568 static int last_comment_line;
569 static int last_non_comment_line;
570
571
572 /* Phase 4: Replace each comment that is not inside a character constant or
573 string literal with a space or newline character.
574 See ECMA-334 section 9.3.2. */
575
576 static int
phase4_getc()577 phase4_getc ()
578 {
579 int c0;
580 int c;
581 bool last_was_star;
582
583 c0 = phase3_getc ();
584 if (c0 != '/')
585 return c0;
586 c = phase3_getc ();
587 switch (c)
588 {
589 default:
590 phase3_ungetc (c);
591 return c0;
592
593 case '*':
594 /* C style comment. */
595 comment_start ();
596 last_was_star = false;
597 for (;;)
598 {
599 c = phase3_getc ();
600 if (c == UEOF)
601 break;
602 /* We skip all leading white space, but not EOLs. */
603 if (!(comment_at_start () && (c == ' ' || c == '\t')))
604 comment_add (c);
605 switch (c)
606 {
607 case UNL:
608 comment_line_end (1);
609 comment_start ();
610 last_was_star = false;
611 continue;
612
613 case '*':
614 last_was_star = true;
615 continue;
616
617 case '/':
618 if (last_was_star)
619 {
620 comment_line_end (2);
621 break;
622 }
623 /* FALLTHROUGH */
624
625 default:
626 last_was_star = false;
627 continue;
628 }
629 break;
630 }
631 last_comment_line = logical_line_number;
632 return ' ';
633
634 case '/':
635 /* C++ style comment. */
636 last_comment_line = logical_line_number;
637 comment_start ();
638 for (;;)
639 {
640 c = phase3_getc ();
641 if (c == UNL || c == UEOF)
642 break;
643 /* We skip all leading white space, but not EOLs. */
644 if (!(comment_at_start () && (c == ' ' || c == '\t')))
645 comment_add (c);
646 }
647 phase3_ungetc (c); /* push back the newline, to decrement logical_line_number */
648 comment_line_end (0);
649 phase3_getc (); /* read the newline again */
650 return UNL;
651 }
652 }
653
/* Push the character C back for the next phase4_getc call, by handing it
   to the phase 3 pushback buffer.
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
660
661
662 /* ======================= Character classification. ====================== */
663
664
/* Return true if a given character is white space.
   See ECMA-334 section 9.3.3.
   The set is the Unicode character class Zs, as of Unicode 4.0:
   grep '^[^;]*;[^;]*;Zs;' UnicodeData-4.0.0.txt  */
static bool
is_whitespace (int c)
{
  /* The only contiguous run: U+2000..U+200B.  */
  if (c >= 0x2000 && c <= 0x200b)
    return true;

  switch (c)
    {
    case 0x0020:
    case 0x00a0:
    case 0x1680:
    case 0x180e:
    case 0x202f:
    case 0x205f:
    case 0x3000:
      return true;
    default:
      return false;
    }
}
688
689
/* C# allows identifiers containing many Unicode characters.  We recognize
   them; to use an identifier with Unicode characters in a --keyword option,
   it must be specified in UTF-8.  */

/* Test whether the character UC is a member of the set described by TABLE.
   TABLE is read as an array of int, organized as a three-level trie:
     - table[0] is the number of level-1 entries;
     - table[1 + (uc >> 16)] is the index of a level-2 block, or negative;
     - the level-2 block, indexed by bits 15..9 of UC, gives the index of a
       level-3 block, or negative;
     - the level-3 block, indexed by bits 8..5 of UC, holds 32-bit bitmaps
       in which bit (uc & 0x1f) is the membership flag.
   Returns 1 if UC is in the set, 0 otherwise.  */
static inline int
bitmap_lookup (const void *table, unsigned int uc)
{
  const int *t = (const int *) table;
  unsigned int plane = uc >> 16;
  int level2_index;
  int level3_index;
  unsigned int bits;

  if (plane >= (unsigned int) t[0])
    return 0;

  level2_index = t[1 + plane];
  if (level2_index < 0)
    return 0;

  level3_index = t[level2_index + ((uc >> 9) & 0x7f)];
  if (level3_index < 0)
    return 0;

  bits = t[level3_index + ((uc >> 5) & 0xf)];
  return (bits >> (uc & 0x1f)) & 1;
}
716
/* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, as of Unicode 4.0,
   plus the underscore.
   This is a three-level bitmap in the layout read by bitmap_lookup; all
   indices count ints from the start of the structure:
     - header[0] is the number of level1 entries;
     - level1[uc >> 16] is the index of a 128-entry level2 block, or -1;
     - a level2 entry is the index of a 16-entry level3 block, or -1;
     - a level3 entry is a 32-bit bitmap covering 32 consecutive
       characters.  */
static const
struct
  {
    int header[1];
    int level1[3];
    int level2[3 << 7];
    /*unsigned*/ int level3[34 << 4];
  }
table_identifier_start =
{
  { 3 },
  { 4, 132, 260 },
  {
    388, 404, 420, 436, 452, 468, 484, 500,
    516, 532, 548, 564, 580, -1, 596, 612,
    628, -1, -1, -1, -1, -1, -1, -1,
    644, -1, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 676, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 692,
    660, 660, 708, -1, -1, -1, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 724, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, 740, 756, 772, 788,
    804, 820, 836, -1, 852, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, 868, 884, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 660, 660, 660, 660, 660,
    660, 660, 660, 900, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, 660, 916, -1, -1
  },
  {
    0x00000000, 0x00000000, 0x87FFFFFE, 0x07FFFFFE,
    0x00000000, 0x04200400, 0xFF7FFFFF, 0xFF7FFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
    0x00000000, 0x00000000, 0x00000000, 0x04000000,
    0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFC03, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
    0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
    0x000000FF, 0x00000000, 0xFFFF0000, 0x000707FF,
    0x00000000, 0x07FFFFFE, 0x000007FF, 0xFFFEC000,
    0xFFFFFFFF, 0xFFFFFFFF, 0x002FFFFF, 0x9C00C060,
    0xFFFD0000, 0x0000FFFF, 0x0000E000, 0x00000000,
    0xFFFFFFFF, 0x0002003F, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFF0, 0x23FFFFFF, 0xFF010000, 0x00000003,
    0xFFF99FE0, 0x23C5FDFF, 0xB0000000, 0x00030003,
    0xFFF987E0, 0x036DFDFF, 0x5E000000, 0x001C0000,
    0xFFFBBFE0, 0x23EDFDFF, 0x00010000, 0x00000003,
    0xFFF99FE0, 0x23EDFDFF, 0xB0000000, 0x00020003,
    0xD63DC7E8, 0x03BFC718, 0x00000000, 0x00000000,
    0xFFFDDFE0, 0x03EFFDFF, 0x00000000, 0x00000003,
    0xFFFDDFE0, 0x23EFFDFF, 0x40000000, 0x00000003,
    0xFFFDDFE0, 0x03FFFDFF, 0x00000000, 0x00000003,
    0xFC7FFFE0, 0x2FFBFFFF, 0x0000007F, 0x00000000,
    0xFFFFFFFE, 0x000DFFFF, 0x0000007F, 0x00000000,
    0xFEF02596, 0x200DECAE, 0x3000005F, 0x00000000,
    0x00000001, 0x00000000, 0xFFFFFEFF, 0x000007FF,
    0x00000F00, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0x000006FB, 0x003F0000, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
    0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
    0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
    0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
    0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
    0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
    0x0003DFFF, 0x0003FFFF, 0x0003FFFF, 0x0001DFFF,
    0xFFFFFFFF, 0x000FFFFF, 0x10800000, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
    0xFFFFFFFF, 0x000001FF, 0x00000000, 0x00000000,
    0x1FFFFFFF, 0x00000000, 0xFFFF0000, 0x001F3FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
    0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
    0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
    0x00000000, 0x00000000, 0x00000000, 0x80020000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
    0x0000000F, 0x00000000, 0x00000000, 0x00000000,
    0x000000E0, 0x1F3E03FE, 0xFFFFFFFE, 0xFFFFFFFF,
    0xE07FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xF7FFFFFF,
    0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xA0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
    0x00000000, 0x00000000, 0x00000000, 0xFFDF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x1FFFFFFF,
    0x00000000, 0x07FFFFFE, 0x07FFFFFE, 0xFFFFFFC0,
    0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x00000000,
    0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
    0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
    0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
    0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
    0xFFFFFDFF, 0xFFFFFDFF, 0x000003F7, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000
  }
};
920
921 /* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, Nd, Pc, Mn, Mc, Cf,
922 as of Unicode 4.0. */
923 static const
924 struct
925 {
926 int header[1];
927 int level1[15];
928 int level2[4 << 7];
929 /*unsigned*/ int level3[36 << 4];
930 }
931 table_identifier_part =
932 {
933 { 15 },
934 {
935 16, 144, 272, -1, -1, -1, -1, -1,
936 -1, -1, -1, -1, -1, -1, 400
937 },
938 {
939 528, 544, 560, 576, 592, 608, 624, 640,
940 656, 672, 688, 704, 720, -1, 736, 752,
941 768, -1, -1, -1, -1, -1, -1, -1,
942 784, -1, 800, 800, 800, 800, 800, 800,
943 800, 800, 800, 800, 800, 800, 816, 800,
944 800, 800, 800, 800, 800, 800, 800, 800,
945 800, 800, 800, 800, 800, 800, 800, 800,
946 800, 800, 800, 800, 800, 800, 800, 800,
947 800, 800, 800, 800, 800, 800, 800, 800,
948 800, 800, 800, 800, 800, 800, 800, 832,
949 800, 800, 848, -1, -1, -1, 800, 800,
950 800, 800, 800, 800, 800, 800, 800, 800,
951 800, 800, 800, 800, 800, 800, 800, 800,
952 800, 800, 800, 864, -1, -1, -1, -1,
953 -1, -1, -1, -1, -1, -1, -1, -1,
954 -1, -1, -1, -1, 880, 896, 912, 928,
955 944, 960, 976, -1, 992, -1, -1, -1,
956 -1, -1, -1, -1, -1, -1, -1, -1,
957 -1, -1, -1, -1, -1, -1, -1, -1,
958 -1, -1, -1, -1, -1, -1, -1, -1,
959 -1, -1, -1, -1, -1, -1, -1, -1,
960 -1, -1, -1, -1, -1, -1, -1, -1,
961 -1, -1, -1, -1, -1, -1, -1, -1,
962 -1, -1, -1, -1, -1, -1, -1, -1,
963 -1, -1, -1, -1, -1, -1, -1, -1,
964 -1, -1, -1, -1, -1, -1, -1, -1,
965 -1, -1, -1, -1, -1, -1, -1, -1,
966 -1, -1, -1, -1, -1, -1, -1, -1,
967 -1, -1, -1, -1, -1, -1, -1, -1,
968 1008, -1, 1024, 1040, -1, -1, -1, -1,
969 -1, -1, -1, -1, -1, -1, -1, -1,
970 -1, -1, -1, -1, -1, -1, -1, -1,
971 800, 800, 800, 800, 800, 800, 800, 800,
972 800, 800, 800, 800, 800, 800, 800, 800,
973 800, 800, 800, 800, 800, 800, 800, 800,
974 800, 800, 800, 800, 800, 800, 800, 800,
975 800, 800, 800, 800, 800, 800, 800, 800,
976 800, 800, 800, 800, 800, 800, 800, 800,
977 800, 800, 800, 800, 800, 800, 800, 800,
978 800, 800, 800, 800, 800, 800, 800, 800,
979 800, 800, 800, 800, 800, 800, 800, 800,
980 800, 800, 800, 800, 800, 800, 800, 800,
981 800, 800, 800, 1056, -1, -1, -1, -1,
982 -1, -1, -1, -1, -1, -1, -1, -1,
983 -1, -1, -1, -1, -1, -1, -1, -1,
984 -1, -1, -1, -1, -1, -1, -1, -1,
985 -1, -1, -1, -1, -1, -1, -1, -1,
986 -1, -1, -1, -1, 800, 1072, -1, -1,
987 1088, -1, -1, -1, -1, -1, -1, -1,
988 -1, -1, -1, -1, -1, -1, -1, -1,
989 -1, -1, -1, -1, -1, -1, -1, -1,
990 -1, -1, -1, -1, -1, -1, -1, -1,
991 -1, -1, -1, -1, -1, -1, -1, -1,
992 -1, -1, -1, -1, -1, -1, -1, -1,
993 -1, -1, -1, -1, -1, -1, -1, -1,
994 -1, -1, -1, -1, -1, -1, -1, -1,
995 -1, -1, -1, -1, -1, -1, -1, -1,
996 -1, -1, -1, -1, -1, -1, -1, -1,
997 -1, -1, -1, -1, -1, -1, -1, -1,
998 -1, -1, -1, -1, -1, -1, -1, -1,
999 -1, -1, -1, -1, -1, -1, -1, -1,
1000 -1, -1, -1, -1, -1, -1, -1, -1,
1001 -1, -1, -1, -1, -1, -1, -1, -1,
1002 -1, -1, -1, -1, -1, -1, -1, -1
1003 },
1004 {
1005 0x00000000, 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE,
1006 0x00000000, 0x04202400, 0xFF7FFFFF, 0xFF7FFFFF,
1007 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1008 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1009 0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
1010 0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
1011 0xFFFFFFFF, 0xFFFFFFFF, 0xE0FFFFFF, 0x0400FFFF,
1012 0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
1013 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1014 0xFFFFFC7B, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
1015 0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
1016 0xFFFE00FF, 0xBBFFFFFB, 0xFFFF0016, 0x000707FF,
1017 0x003F000F, 0x07FFFFFE, 0x01FFFFFF, 0xFFFFC3FF,
1018 0xFFFFFFFF, 0xFFFFFFFF, 0xBFEFFFFF, 0x9FFFFDFF,
1019 0xFFFF8000, 0xFFFFFFFF, 0x0000E7FF, 0x00000000,
1020 0xFFFFFFFF, 0x0003FFFF, 0x00000000, 0x00000000,
1021 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1022 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1023 0xFFFFFFFE, 0xF3FFFFFF, 0xFF1F3FFF, 0x0000FFCF,
1024 0xFFF99FEE, 0xF3C5FDFF, 0xB080399F, 0x0003FFCF,
1025 0xFFF987EE, 0xD36DFDFF, 0x5E003987, 0x001FFFC0,
1026 0xFFFBBFEE, 0xF3EDFDFF, 0x00013BBF, 0x0000FFCF,
1027 0xFFF99FEE, 0xF3EDFDFF, 0xB0C0398F, 0x0002FFC3,
1028 0xD63DC7EC, 0xC3BFC718, 0x00803DC7, 0x0000FF80,
1029 0xFFFDDFEE, 0xC3EFFDFF, 0x00603DDF, 0x0000FFC3,
1030 0xFFFDDFEC, 0xF3EFFDFF, 0x40603DDF, 0x0000FFC3,
1031 0xFFFDDFEC, 0xC3FFFDFF, 0x00803DCF, 0x0000FFC3,
1032 0xFC7FFFEC, 0x2FFBFFFF, 0xFF5F847F, 0x000C0000,
1033 0xFFFFFFFE, 0x07FFFFFF, 0x03FF7FFF, 0x00000000,
1034 0xFEF02596, 0x3BFFECAE, 0x33FF3F5F, 0x00000000,
1035 0x03000001, 0xC2A003FF, 0xFFFFFEFF, 0xFFFE07FF,
1036 0xFEFF0FDF, 0x1FFFFFFF, 0x00000040, 0x00000000,
1037 0xFFFFFFFF, 0x03C7F6FB, 0x03FF03FF, 0x00000000,
1038 0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
1039 0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
1040 0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
1041 0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
1042 0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
1043 0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x0003FE00,
1044 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
1045 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1046 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1047 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1048 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1049 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
1050 0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
1051 0x001FDFFF, 0x001FFFFF, 0x000FFFFF, 0x000DDFFF,
1052 0xFFFFFFFF, 0xFFFFFFFF, 0x308FFFFF, 0x000003FF,
1053 0x03FF3800, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
1054 0xFFFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
1055 0x1FFFFFFF, 0x0FFF0FFF, 0xFFFFFFC0, 0x001F3FFF,
1056 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1057 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1058 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1059 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
1060 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1061 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1062 0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
1063 0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
1064 0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
1065 0x0000F000, 0x80007C00, 0x00100001, 0x8002FC0F,
1066 0x00000000, 0x00000000, 0x1FFF0000, 0x000007E2,
1067 0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
1068 0x0000000F, 0x00000000, 0x00000000, 0x00000000,
1069 0x000000E0, 0x1F3EFFFE, 0xFFFFFFFE, 0xFFFFFFFF,
1070 0xE67FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF,
1071 0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
1072 0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
1073 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1074 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1075 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1076 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1077 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1078 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1079 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1080 0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
1081 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1082 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1083 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1084 0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
1085 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1086 0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
1087 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1088 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1089 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1090 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1091 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1092 0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
1093 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1094 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1095 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1096 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1097 0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
1098 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1099 0xE0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
1100 0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
1101 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1102 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1103 0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
1104 0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
1105 0x0000FFFF, 0x0018000F, 0x0000E000, 0xFFDF0000,
1106 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x9FFFFFFF,
1107 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE, 0xFFFFFFE0,
1108 0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x0E000000,
1109 0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
1110 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
1111 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1112 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1113 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1114 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1115 0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
1116 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
1117 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1118 0x3FFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
1119 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1120 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1121 0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
1122 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1123 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1124 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1125 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1126 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1127 0x00000000, 0x00000000, 0x00000000, 0xFFFFE3E0,
1128 0x00000FE7, 0x00003C00, 0x00000000, 0x00000000,
1129 0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
1130 0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
1131 0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
1132 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1133 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1134 0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
1135 0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
1136 0xFFFFFDFF, 0xFFFFFDFF, 0xFFFFC3F7, 0xFFFFFFFF,
1137 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1138 0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
1139 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1140 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1141 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
1142 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1143 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1144 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1145 0x00000002, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1146 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1147 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1148 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF
1149 }
1150 };
1151
1152 /* Return true if a given character can occur as first character of an
1153 identifier. See ECMA-334 section 9.4.2. */
1154 static bool
is_identifier_start(int c)1155 is_identifier_start (int c)
1156 {
1157 return bitmap_lookup (&table_identifier_start, c);
1158 /* In ASCII only this would be:
1159 return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_');
1160 */
1161 }
1162
1163 /* Return true if a given character can occur as character of an identifier.
1164 See ECMA-334 section 9.4.2. */
1165 static bool
is_identifier_part(int c)1166 is_identifier_part (int c)
1167 {
1168 return bitmap_lookup (&table_identifier_part, c);
1169 /* In ASCII only this would be:
1170 return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1171 || (c >= '0' && c <= '9') || c == '_');
1172 */
1173 }
1174
/* Predicate that accepts every character.  It is handed to
   do_getc_unicode_escaped when a Unicode escape needs no filtering.  */
static bool
is_any_character (int c)
{
  /* The argument is deliberately ignored.  */
  (void) c;
  return true;
}
1180
1181
1182 /* ======================= Preprocessor directives. ======================= */
1183
1184
1185 /* Phase 5: Remove preprocessor lines. See ECMA-334 section 9.5.
1186 As a side effect, this also removes initial whitespace on every line;
1187 this whitespace doesn't matter. */
1188
1189 static int phase5_pushback[10];
1190 static int phase5_pushback_length;
1191
1192 static int
phase5_getc()1193 phase5_getc ()
1194 {
1195 int c;
1196
1197 if (phase5_pushback_length)
1198 return phase5_pushback[--phase5_pushback_length];
1199
1200 c = phase4_getc ();
1201 if (c != UNL)
1202 return c;
1203
1204 do
1205 c = phase3_getc ();
1206 while (c != UEOF && is_whitespace (c));
1207
1208 if (c == '#')
1209 {
1210 /* Ignore the entire line containing the preprocessor directive
1211 (including the // comment if it contains one). */
1212 do
1213 c = phase3_getc ();
1214 while (c != UEOF && c != UNL);
1215 return c;
1216 }
1217 else
1218 {
1219 phase3_ungetc (c);
1220 return UNL;
1221 }
1222 }
1223
#ifdef unused
/* Push the character C back onto the phase 5 pushback stack.
   UEOF is never stored.  Aborts when the stack is already full.  */
static void
phase5_ungetc (int c)
{
  if (c == UEOF)
    return;

  if (phase5_pushback_length == SIZEOF (phase5_pushback))
    abort ();
  phase5_pushback[phase5_pushback_length] = c;
  phase5_pushback_length++;
}
#endif
1236
1237
1238 /* ========================== Reading of tokens. ========================== */
1239
/* The kinds of tokens that the tokenizer distinguishes.  */
typedef enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_lbrace,            /* { */
  token_type_rbrace,            /* } */
  token_type_comma,             /* , */
  token_type_dot,               /* . */
  token_type_string_literal,    /* "abc", @"abc" */
  token_type_number,            /* 1.23 */
  token_type_symbol,            /* identifier, keyword, null */
  token_type_plus,              /* + */
  token_type_other              /* character literal, misc. operator */
} token_type_ty;
1256
1257 typedef struct token_ty token_ty;
1258 struct token_ty
1259 {
1260 token_type_ty type;
1261 char *string; /* for token_type_symbol */
1262 mixed_string_ty *mixed_string; /* for token_type_string_literal */
1263 refcounted_string_list_ty *comment; /* for token_type_string_literal */
1264 int line_number;
1265 int logical_line_number;
1266 };
1267
1268
1269 /* Free the memory pointed to by a 'struct token_ty'. */
1270 static inline void
free_token(token_ty * tp)1271 free_token (token_ty *tp)
1272 {
1273 if (tp->type == token_type_symbol)
1274 free (tp->string);
1275 if (tp->type == token_type_string_literal)
1276 {
1277 mixed_string_free (tp->mixed_string);
1278 drop_reference (tp->comment);
1279 }
1280 }
1281
1282
1283 /* Read a Unicode escape sequence outside string/character literals.
1284 Reject Unicode escapes that don't fulfill the given predicate.
1285 See ECMA-334 section 9.4.2. */
1286 static int
do_getc_unicode_escaped(bool (* predicate)(int))1287 do_getc_unicode_escaped (bool (*predicate) (int))
1288 {
1289 int c;
1290
1291 /* Use phase 3, because phase 4 elides comments. */
1292 c = phase3_getc ();
1293 if (c == UEOF)
1294 return '\\';
1295 if (c == 'u' || c == 'U')
1296 {
1297 unsigned char buf[8];
1298 int expect;
1299 unsigned int n;
1300 int i;
1301
1302 expect = (c == 'U' ? 8 : 4);
1303 n = 0;
1304 for (i = 0; i < expect; i++)
1305 {
1306 int c1 = phase3_getc ();
1307
1308 if (c1 >= '0' && c1 <= '9')
1309 n = (n << 4) + (c1 - '0');
1310 else if (c1 >= 'A' && c1 <= 'F')
1311 n = (n << 4) + (c1 - 'A' + 10);
1312 else if (c1 >= 'a' && c1 <= 'f')
1313 n = (n << 4) + (c1 - 'a' + 10);
1314 else
1315 {
1316 phase3_ungetc (c1);
1317 while (--i >= 0)
1318 phase3_ungetc (buf[i]);
1319 phase3_ungetc (c);
1320 return '\\';
1321 }
1322
1323 buf[i] = c1;
1324 }
1325
1326 if (n >= 0x110000)
1327 {
1328 error_with_progname = false;
1329 error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1330 logical_file_name, line_number);
1331 error_with_progname = true;
1332 }
1333 else if (predicate (n))
1334 return n;
1335
1336 while (--i >= 0)
1337 phase3_ungetc (buf[i]);
1338 }
1339 phase3_ungetc (c);
1340 return '\\';
1341 }
1342
1343
1344 /* Read an escape sequence inside a string literal or character literal.
1345 See ECMA-334 sections 9.4.4.4., 9.4.4.5. */
1346 static int
do_getc_escaped()1347 do_getc_escaped ()
1348 {
1349 int c;
1350 int n;
1351 int i;
1352
1353 /* Use phase 3, because phase 4 elides comments. */
1354 c = phase3_getc ();
1355 if (c == UEOF)
1356 return '\\';
1357 switch (c)
1358 {
1359 case 'a':
1360 return 0x0007;
1361 case 'b':
1362 return 0x0008;
1363 case 't':
1364 return 0x0009;
1365 case 'n':
1366 return 0x000a;
1367 case 'v':
1368 return 0x000b;
1369 case 'f':
1370 return 0x000c;
1371 case 'r':
1372 return 0x000d;
1373 case '"':
1374 return '"';
1375 case '\'':
1376 return '\'';
1377 case '\\':
1378 return '\\';
1379 case '0':
1380 return 0x0000;
1381 case 'x':
1382 c = phase3_getc ();
1383 switch (c)
1384 {
1385 default:
1386 phase3_ungetc (c);
1387 phase3_ungetc ('x');
1388 return '\\';
1389
1390 case '0': case '1': case '2': case '3': case '4':
1391 case '5': case '6': case '7': case '8': case '9':
1392 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1393 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1394 break;
1395 }
1396 n = 0;
1397 for (i = 0;; i++)
1398 {
1399 switch (c)
1400 {
1401 default:
1402 phase3_ungetc (c);
1403 return n;
1404 case '0': case '1': case '2': case '3': case '4':
1405 case '5': case '6': case '7': case '8': case '9':
1406 n = n * 16 + c - '0';
1407 break;
1408 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1409 n = n * 16 + 10 + c - 'A';
1410 break;
1411 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1412 n = n * 16 + 10 + c - 'a';
1413 break;
1414 }
1415 if (i == 3)
1416 break;
1417 c = phase3_getc ();
1418 }
1419 return n;
1420 case 'u': case 'U':
1421 phase3_ungetc (c);
1422 return do_getc_unicode_escaped (is_any_character);
1423 default:
1424 /* Invalid escape sequence. */
1425 phase3_ungetc (c);
1426 return '\\';
1427 }
1428 }
1429
1430 /* Read a regular string literal or character literal.
1431 See ECMA-334 sections 9.4.4.4., 9.4.4.5. */
1432 static void
accumulate_escaped(struct mixed_string_buffer * literal,int delimiter)1433 accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
1434 {
1435 int c;
1436
1437 for (;;)
1438 {
1439 /* Use phase 3, because phase 4 elides comments. */
1440 c = phase3_getc ();
1441 if (c == UEOF || c == delimiter)
1442 break;
1443 if (c == UNL)
1444 {
1445 phase3_ungetc (c);
1446 error_with_progname = false;
1447 if (delimiter == '\'')
1448 error (0, 0, _("%s:%d: warning: unterminated character constant"),
1449 logical_file_name, line_number);
1450 else
1451 error (0, 0, _("%s:%d: warning: unterminated string constant"),
1452 logical_file_name, line_number);
1453 error_with_progname = true;
1454 break;
1455 }
1456 if (c == '\\')
1457 c = do_getc_escaped ();
1458 if (literal)
1459 mixed_string_buffer_append_unicode (literal, c);
1460 }
1461 }
1462
1463
1464 /* Combine characters into tokens. Discard whitespace. */
1465
/* Pushback stack of tokens for phase 6.
   Maximum used guaranteed to be < 4.  */
static token_ty phase6_pushback[4];
static int phase6_pushback_length;

/* Read the next token and store it in *TP.
   A token previously handed to phase6_unget is delivered first.  Otherwise
   characters are read, starting with phase5_getc, until one complete token
   has been recognized; whitespace and newlines between tokens are skipped.
   On return TP->type is set.  TP->string is set for token_type_symbol (and
   is NULL for freshly read tokens of other types); TP->mixed_string and
   TP->comment are set for token_type_string_literal.  TP->line_number and
   TP->logical_line_number hold the values the global counters had just
   before the first character of the token was read.  */
static void
phase6_get (token_ty *tp)
{
  int c;

  if (phase6_pushback_length)
    {
      *tp = phase6_pushback[--phase6_pushback_length];
      return;
    }
  tp->string = NULL;

  for (;;)
    {
      /* Sample the line counters before every character, so that they
         describe the start of the token once a non-blank is found.  */
      tp->line_number = line_number;
      tp->logical_line_number = logical_line_number;
      c = phase5_getc ();

      if (c == UEOF)
        {
          tp->type = token_type_eof;
          return;
        }

      switch (c)
        {
        case UNL:
          /* The accumulated comments are dropped once a line with a
             non-comment token has been seen after the last comment.  */
          if (last_non_comment_line > last_comment_line)
            savable_comment_reset ();
          /* FALLTHROUGH */
        case ' ':
        case '\t':
        case '\f':
          /* Ignore whitespace and comments.  */
          continue;
        }

      last_non_comment_line = tp->logical_line_number;

      switch (c)
        {
        case '(':
          tp->type = token_type_lparen;
          return;

        case ')':
          tp->type = token_type_rparen;
          return;

        case '{':
          tp->type = token_type_lbrace;
          return;

        case '}':
          tp->type = token_type_rbrace;
          return;

        case ',':
          tp->type = token_type_comma;
          return;

        case '.':
          /* A '.' directly followed by a digit starts a number; any other
             '.' is a token of its own.  */
          c = phase4_getc ();
          if (!(c >= '0' && c <= '9'))
            {
              phase4_ungetc (c);
              tp->type = token_type_dot;
              return;
            }
          /* FALLTHROUGH */

        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          {
            /* Don't need to verify the complicated syntax of integers and
               floating-point numbers.  We assume a valid C# input.
               The simplified syntax that we recognize as number is: any
               sequence of alphanumeric characters, additionally '+' and '-'
               immediately after 'e' or 'E' except in hexadecimal numbers.  */
            bool hexadecimal = false;

            for (;;)
              {
                c = phase4_getc ();
                if (c >= '0' && c <= '9')
                  continue;
                if ((c >= 'A' && c <= 'Z') || (c >= 'a' &&c <= 'z'))
                  {
                    if (c == 'X' || c == 'x')
                      hexadecimal = true;
                    if ((c == 'E' || c == 'e') && !hexadecimal)
                      {
                        /* Swallow the sign of an exponent.  */
                        c = phase4_getc ();
                        if (!(c == '+' || c == '-'))
                          phase4_ungetc (c);
                      }
                    continue;
                  }
                if (c == '.')
                  continue;
                break;
              }
            /* C is the first character that is not part of the number.  */
            phase4_ungetc (c);
            tp->type = token_type_number;
            return;
          }

        case '"':
          /* Regular string literal.  */
          {
            struct mixed_string_buffer literal;

            /* lexical_context is lc_string while the literal is read and
               is reset to lc_outside afterwards.  */
            lexical_context = lc_string;
            mixed_string_buffer_init (&literal,
                                      lexical_context,
                                      logical_file_name,
                                      logical_line_number);
            accumulate_escaped (&literal, '"');
            tp->mixed_string = mixed_string_buffer_result (&literal);
            tp->comment = add_reference (savable_comment);
            lexical_context = lc_outside;
            tp->type = token_type_string_literal;
            return;
          }

        case '\'':
          /* Character literal.  Its contents are parsed but not stored,
             hence the NULL buffer.  */
          {
            accumulate_escaped (NULL, '\'');
            tp->type = token_type_other;
            return;
          }

        case '+':
          /* Only a lone '+' gets its own token type.  */
          c = phase4_getc ();
          if (c == '+')
            /* Operator ++ */
            tp->type = token_type_other;
          else if (c == '=')
            /* Operator += */
            tp->type = token_type_other;
          else
            {
              /* Operator + */
              phase4_ungetc (c);
              tp->type = token_type_plus;
            }
          return;

        case '@':
          c = phase4_getc ();
          if (c == '"')
            {
              /* Verbatim string literal.  */
              struct mixed_string_buffer literal;

              lexical_context = lc_string;
              mixed_string_buffer_init (&literal, lexical_context,
                                        logical_file_name, logical_line_number);
              for (;;)
                {
                  /* Use phase 2, because phase 4 elides comments and phase 3
                     mixes up the newline characters.  */
                  c = phase2_getc ();
                  if (c == UEOF)
                    break;
                  if (c == '"')
                    {
                      /* A doubled '"' stands for one '"' in the string; a
                         single '"' terminates the literal.  */
                      c = phase2_getc ();
                      if (c != '"')
                        {
                          phase2_ungetc (c);
                          break;
                        }
                    }
                  /* No special treatment of newline and backslash here.  */
                  mixed_string_buffer_append_unicode (&literal, c);
                }
              tp->mixed_string = mixed_string_buffer_result (&literal);
              tp->comment = add_reference (savable_comment);
              lexical_context = lc_outside;
              tp->type = token_type_string_literal;
              return;
            }
          /* FALLTHROUGH, so that @identifier is recognized.
             At this point C is the character that followed the '@'.  */

        default:
          /* A backslash may introduce a Unicode escape that denotes an
             identifier character; do_getc_unicode_escaped returns '\\'
             when it does not.  */
          if (c == '\\')
            c = do_getc_unicode_escaped (is_identifier_start);
          if (is_identifier_start (c))
            {
              struct mixed_string_buffer buffer;
              mixed_string_ty *mixed_string;

              mixed_string_buffer_init (&buffer, lexical_context,
                                        logical_file_name, logical_line_number);
              for (;;)
                {
                  mixed_string_buffer_append_unicode (&buffer, c);
                  c = phase4_getc ();
                  if (c == '\\')
                    c = do_getc_unicode_escaped (is_identifier_part);
                  if (!is_identifier_part (c))
                    break;
                }
              /* C is the first character that does not belong to the
                 identifier.  */
              phase4_ungetc (c);
              mixed_string = mixed_string_buffer_result (&buffer);
              /* Only the plain contents are kept in the token; they are
                 released later by free_token.  */
              tp->string = mixed_string_contents (mixed_string);
              mixed_string_free (mixed_string);
              tp->type = token_type_symbol;
              return;
            }
          else
            {
              /* Misc. operator.  */
              tp->type = token_type_other;
              return;
            }
        }
    }
}
1691
1692 /* Supports 3 tokens of pushback. */
1693 static void
phase6_unget(token_ty * tp)1694 phase6_unget (token_ty *tp)
1695 {
1696 if (tp->type != token_type_eof)
1697 {
1698 if (phase6_pushback_length == SIZEOF (phase6_pushback))
1699 abort ();
1700 phase6_pushback[phase6_pushback_length++] = *tp;
1701 }
1702 }
1703
1704
1705 /* Compile-time optimization of string literal concatenation.
1706 Combine "string1" + ... + "stringN" to the concatenated string if
1707 - the token after this expression is not '.' (because then the last
1708 string could be part of a method call expression). */
1709
1710 static token_ty phase7_pushback[2];
1711 static int phase7_pushback_length;
1712
1713 static void
phase7_get(token_ty * tp)1714 phase7_get (token_ty *tp)
1715 {
1716 if (phase7_pushback_length)
1717 {
1718 *tp = phase7_pushback[--phase7_pushback_length];
1719 return;
1720 }
1721
1722 phase6_get (tp);
1723 if (tp->type == token_type_string_literal)
1724 {
1725 mixed_string_ty *sum = tp->mixed_string;
1726
1727 for (;;)
1728 {
1729 token_ty token2;
1730
1731 phase6_get (&token2);
1732 if (token2.type == token_type_plus)
1733 {
1734 token_ty token3;
1735
1736 phase6_get (&token3);
1737 if (token3.type == token_type_string_literal)
1738 {
1739 token_ty token_after;
1740
1741 phase6_get (&token_after);
1742 if (token_after.type != token_type_dot)
1743 {
1744 sum = mixed_string_concat_free1 (sum, token3.mixed_string);
1745
1746 phase6_unget (&token_after);
1747 free_token (&token3);
1748 free_token (&token2);
1749 continue;
1750 }
1751 phase6_unget (&token_after);
1752 }
1753 phase6_unget (&token3);
1754 }
1755 phase6_unget (&token2);
1756 break;
1757 }
1758 tp->mixed_string = sum;
1759 }
1760 }
1761
1762 /* Supports 2 tokens of pushback. */
1763 static void
phase7_unget(token_ty * tp)1764 phase7_unget (token_ty *tp)
1765 {
1766 if (tp->type != token_type_eof)
1767 {
1768 if (phase7_pushback_length == SIZEOF (phase7_pushback))
1769 abort ();
1770 phase7_pushback[phase7_pushback_length++] = *tp;
1771 }
1772 }
1773
1774
/* Fetch the next token into *TP.  This is the lexer entry point used by
   extract_parenthesized; it forwards to phase7_get, the last phase of the
   tokenizer.  */
static void
x_csharp_lex (token_ty *tp)
{
  phase7_get (tp);
}
1780
/* Push the token *TP back, so that the next x_csharp_lex call delivers it
   again.  Forwards to phase7_unget.
   Supports 2 tokens of pushback.  */
static void
x_csharp_unlex (token_ty *tp)
{
  phase7_unget (tp);
}
1787
1788
1789 /* ========================= Extracting strings. ========================== */
1790
1791
/* Context lookup table.
   Set by extract_csharp from its FLAG_TABLE argument and consulted by
   extract_parenthesized to find the flag contexts of (dotted) symbol
   names.  */
static flag_context_list_table_ty *flag_context_list_table;
1794
1795
1796 /* The file is broken into tokens. Scan the token stream, looking for
1797 a keyword, followed by a left paren, followed by a string. When we
1798 see this sequence, we have something to remember. We assume we are
   looking at a valid C# program, and leave the complaints about
1800 the grammar to the compiler.
1801
1802 Normal handling: Look for
1803 keyword ( ... msgid ... )
1804 Plural handling: Look for
1805 keyword ( ... msgid ... msgid_plural ... )
1806
1807 We use recursion because the arguments before msgid or between msgid
1808 and msgid_plural can contain subexpressions of the same form. */
1809
1810
/* Extract messages until the next balanced closing parenthesis or brace,
   depending on TERMINATOR.
   Extracted messages are added to MLP.
   OUTER_CONTEXT is the flag context of the enclosing expression, and
   CONTEXT_ITER yields the flag contexts of the successive arguments; it
   is advanced once at the start and once at every comma.
   ARGPARSER collects the string arguments seen at this nesting level;
   arglist_parser_done is called on it before every return.
   Return true upon eof, false upon closing parenthesis or brace.  */
static bool
extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
                       flag_context_ty outer_context,
                       flag_context_list_iterator_ty context_iter,
                       struct arglist_parser *argparser)
{
  /* Current argument number.  */
  int arg = 1;
  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
  int state;
  /* Parameters of the keyword just seen.  Defined only in state 1.  */
  const struct callshapes *next_shapes = NULL;
  /* Context iterator that will be used if the next token is a '('.  */
  flag_context_list_iterator_ty next_context_iter =
    passthrough_context_list_iterator;
  /* Current context.  */
  flag_context_ty inner_context =
    inherited_context (outer_context,
                       flag_context_list_iterator_advance (&context_iter));

  /* Start state is 0.  */
  state = 0;

  for (;;)
    {
      token_ty token;

      x_csharp_lex (&token);
      switch (token.type)
        {
        case token_type_symbol:
          {
            /* Combine symbol1 . ... . symbolN to a single string, so that
               we can recognize static function calls like
               GettextResource.gettext.  The information present for
               symbolI.....symbolN has precedence over the information for
               symbolJ.....symbolN with J > I.  */
            char *sum = token.string;
            size_t sum_len = strlen (sum);
            const char *dottedname;
            flag_context_list_ty *context_list;

            for (;;)
              {
                token_ty token2;

                x_csharp_lex (&token2);
                if (token2.type == token_type_dot)
                  {
                    token_ty token3;

                    x_csharp_lex (&token3);
                    if (token3.type == token_type_symbol)
                      {
                        char *addend = token3.string;
                        size_t addend_len = strlen (addend);

                        /* Room for the old name, a '.', the addend and the
                           terminating NUL.  */
                        sum =
                          (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
                        sum[sum_len] = '.';
                        memcpy (sum + sum_len + 1, addend, addend_len + 1);
                        sum_len += 1 + addend_len;

                        free_token (&token3);
                        free_token (&token2);
                        continue;
                      }
                    x_csharp_unlex (&token3);
                  }
                x_csharp_unlex (&token2);
                break;
              }

            /* Look for a keyword, trying the full dotted name first and
               then the names obtained by stripping leading components.  */
            for (dottedname = sum;;)
              {
                void *keyword_value;

                if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
                                     &keyword_value)
                    == 0)
                  {
                    next_shapes = (const struct callshapes *) keyword_value;
                    state = 1;
                    break;
                  }

                dottedname = strchr (dottedname, '.');
                if (dottedname == NULL)
                  {
                    state = 0;
                    break;
                  }
                dottedname++;
              }

            /* Look up the flag contexts in the same order.  context_list
               is NULL after the loop if no suffix of the name is known.  */
            for (dottedname = sum;;)
              {
                context_list =
                  flag_context_list_table_lookup (
                    flag_context_list_table,
                    dottedname, strlen (dottedname));
                if (context_list != NULL)
                  break;

                dottedname = strchr (dottedname, '.');
                if (dottedname == NULL)
                  break;
                dottedname++;
              }
            next_context_iter = flag_context_list_iterator (context_list);

            /* sum is token.string, possibly reallocated; freeing it here
               takes the place of free_token (&token).  */
            free (sum);
            continue;
          }

        case token_type_lparen:
          /* Recurse into the parenthesized expression.  If a keyword was
             just seen, its call shapes drive the new argument parser.  */
          if (extract_parenthesized (mlp, token_type_rparen,
                                     inner_context, next_context_iter,
                                     arglist_parser_alloc (mlp,
                                                           state ? next_shapes : NULL)))
            {
              arglist_parser_done (argparser, arg);
              return true;
            }
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_rparen:
          if (terminator == token_type_rparen)
            {
              arglist_parser_done (argparser, arg);
              return false;
            }
          /* Unbalanced ')': warn if a '}' was expected, then go on.  */
          if (terminator == token_type_rbrace)
            {
              error_with_progname = false;
              error (0, 0,
                     _("%s:%d: warning: ')' found where '}' was expected"),
                     logical_file_name, token.line_number);
              error_with_progname = true;
            }
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_lbrace:
          /* A brace-delimited block starts afresh: null context and an
             argument parser without call shapes.  */
          if (extract_parenthesized (mlp, token_type_rbrace,
                                     null_context, null_context_list_iterator,
                                     arglist_parser_alloc (mlp, NULL)))
            {
              arglist_parser_done (argparser, arg);
              return true;
            }
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_rbrace:
          if (terminator == token_type_rbrace)
            {
              arglist_parser_done (argparser, arg);
              return false;
            }
          /* Unbalanced '}': warn if a ')' was expected, then go on.  */
          if (terminator == token_type_rparen)
            {
              error_with_progname = false;
              error (0, 0,
                     _("%s:%d: warning: '}' found where ')' was expected"),
                     logical_file_name, token.line_number);
              error_with_progname = true;
            }
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_comma:
          /* Next argument: advance the argument number and the context.  */
          arg++;
          inner_context =
            inherited_context (outer_context,
                               flag_context_list_iterator_advance (
                                 &context_iter));
          next_context_iter = passthrough_context_list_iterator;
          state = 0;
          continue;

        case token_type_string_literal:
          {
            lex_pos_ty pos;

            pos.file_name = logical_file_name;
            pos.line_number = token.line_number;

            if (extract_all)
              {
                /* Every string is a message, regardless of keywords.  */
                char *string = mixed_string_contents (token.mixed_string);
                mixed_string_free (token.mixed_string);
                remember_a_message (mlp, NULL, string, true, false,
                                    inner_context, &pos,
                                    NULL, token.comment, true);
              }
            else
              /* Hand the string to the argument parser of this level.
                 NOTE(review): token.mixed_string is not freed here, so
                 presumably the parser takes ownership -- confirm.  */
              arglist_parser_remember (argparser, arg, token.mixed_string,
                                       inner_context,
                                       pos.file_name, pos.line_number,
                                       token.comment, true);
          }
          drop_reference (token.comment);
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_eof:
          arglist_parser_done (argparser, arg);
          return true;

        case token_type_dot:
        case token_type_number:
        case token_type_plus:
        case token_type_other:
          /* Any other token cancels a keyword that was just seen.  */
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        default:
          abort ();
        }
    }
}
2044
2045
2046 void
extract_csharp(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)2047 extract_csharp (FILE *f,
2048 const char *real_filename, const char *logical_filename,
2049 flag_context_list_table_ty *flag_table,
2050 msgdomain_list_ty *mdlp)
2051 {
2052 message_list_ty *mlp = mdlp->item[0]->messages;
2053
2054 fp = f;
2055 real_file_name = real_filename;
2056 logical_file_name = xstrdup (logical_filename);
2057 line_number = 1;
2058
2059 phase1_pushback_length = 0;
2060
2061 lexical_context = lc_outside;
2062
2063 phase2_pushback_length = 0;
2064
2065 logical_line_number = 1;
2066
2067 phase3_pushback_length = 0;
2068
2069 last_comment_line = -1;
2070 last_non_comment_line = -1;
2071
2072 phase5_pushback_length = 0;
2073 phase6_pushback_length = 0;
2074 phase7_pushback_length = 0;
2075
2076 flag_context_list_table = flag_table;
2077
2078 init_keywords ();
2079
2080 /* Eat tokens until eof is seen. When extract_parenthesized returns
2081 due to an unbalanced closing parenthesis, just restart it. */
2082 while (!extract_parenthesized (mlp, token_type_eof,
2083 null_context, null_context_list_iterator,
2084 arglist_parser_alloc (mlp, NULL)))
2085 ;
2086
2087 fp = NULL;
2088 real_file_name = NULL;
2089 logical_file_name = NULL;
2090 line_number = 0;
2091 }
2092