1 /* Reading binary .mo files.
2 Copyright (C) 1995-1998, 2000-2007, 2014-2015, 2017, 2020 Free Software Foundation, Inc.
3 Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, April 1995.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21
22 /* Specification. */
23 #include "read-mo.h"
24
25 #include <errno.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stddef.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 /* These two include files describe the binary .mo format. */
33 #include "gmo.h"
34 #include "hash-string.h"
35
36 #include "error.h"
37 #include "xalloc.h"
38 #include "binary-io.h"
39 #include "message.h"
40 #include "format.h"
41 #include "gettext.h"
42 #include "xsize.h"
43
44 #define _(str) gettext (str)
45
46
/* Byte order in which the 32-bit numbers of a .mo file are stored.  */
enum mo_endianness
{
  MO_LITTLE_ENDIAN = 0,         /* Least significant byte comes first.  */
  MO_BIG_ENDIAN = 1             /* Most significant byte comes first.  */
};
52
53 /* We read the file completely into memory. This is more efficient than
54 lots of lseek(). This struct represents the .mo file in memory. */
55 struct binary_mo_file
56 {
57 const char *filename;
58 char *data;
59 size_t size;
60 enum mo_endianness endian;
61 };
62
63
64 /* Read the contents of the given input stream. */
65 static void
read_binary_mo_file(struct binary_mo_file * bfp,FILE * fp,const char * filename)66 read_binary_mo_file (struct binary_mo_file *bfp,
67 FILE *fp, const char *filename)
68 {
69 char *buf = NULL;
70 size_t alloc = 0;
71 size_t size = 0;
72 size_t count;
73
74 while (!feof (fp))
75 {
76 const size_t increment = 4096;
77 if (size + increment > alloc)
78 {
79 alloc = alloc + alloc / 2;
80 if (alloc < size + increment)
81 alloc = size + increment;
82 buf = (char *) xrealloc (buf, alloc);
83 }
84 count = fread (buf + size, 1, increment, fp);
85 if (count == 0)
86 {
87 if (ferror (fp))
88 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
89 filename);
90 }
91 else
92 size += count;
93 }
94 buf = (char *) xrealloc (buf, size);
95 bfp->filename = filename;
96 bfp->data = buf;
97 bfp->size = size;
98 }
99
100 /* Get a 32-bit number from the file, at the given file position. */
101 static nls_uint32
get_uint32(const struct binary_mo_file * bfp,size_t offset)102 get_uint32 (const struct binary_mo_file *bfp, size_t offset)
103 {
104 nls_uint32 b0, b1, b2, b3;
105 size_t end = xsum (offset, 4);
106
107 if (size_overflow_p (end) || end > bfp->size)
108 error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
109
110 b0 = *(unsigned char *) (bfp->data + offset + 0);
111 b1 = *(unsigned char *) (bfp->data + offset + 1);
112 b2 = *(unsigned char *) (bfp->data + offset + 2);
113 b3 = *(unsigned char *) (bfp->data + offset + 3);
114 if (bfp->endian == MO_LITTLE_ENDIAN)
115 return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
116 else
117 return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
118 }
119
120 /* Get a static string from the file, at the given file position. */
121 static char *
get_string(const struct binary_mo_file * bfp,size_t offset,size_t * lengthp)122 get_string (const struct binary_mo_file *bfp, size_t offset, size_t *lengthp)
123 {
124 /* See 'struct string_desc'. */
125 nls_uint32 s_length = get_uint32 (bfp, offset);
126 nls_uint32 s_offset = get_uint32 (bfp, offset + 4);
127 size_t s_end = xsum3 (s_offset, s_length, 1);
128
129 if (size_overflow_p (s_end) || s_end > bfp->size)
130 error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
131 if (bfp->data[s_offset + s_length] != '\0')
132 error (EXIT_FAILURE, 0,
133 _("file \"%s\" contains a not NUL terminated string"),
134 bfp->filename);
135
136 *lengthp = s_length + 1;
137 return bfp->data + s_offset;
138 }
139
140 /* Get a system dependent string from the file, at the given file position. */
141 static char *
get_sysdep_string(const struct binary_mo_file * bfp,size_t offset,const struct mo_file_header * header,size_t * lengthp)142 get_sysdep_string (const struct binary_mo_file *bfp, size_t offset,
143 const struct mo_file_header *header, size_t *lengthp)
144 {
145 /* See 'struct sysdep_string'. */
146 size_t length;
147 char *string;
148 size_t i;
149 char *p;
150 nls_uint32 s_offset;
151
152 /* Compute the length. */
153 s_offset = get_uint32 (bfp, offset);
154 length = 0;
155 for (i = 4; ; i += 8)
156 {
157 nls_uint32 segsize = get_uint32 (bfp, offset + i);
158 nls_uint32 sysdepref = get_uint32 (bfp, offset + i + 4);
159 nls_uint32 sysdep_segment_offset;
160 nls_uint32 ss_length;
161 nls_uint32 ss_offset;
162 size_t ss_end;
163 size_t s_end;
164 size_t n;
165
166 s_end = xsum (s_offset, segsize);
167 if (size_overflow_p (s_end) || s_end > bfp->size)
168 error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
169 length += segsize;
170 s_offset += segsize;
171
172 if (sysdepref == SEGMENTS_END)
173 {
174 /* The last static segment must end in a NUL. */
175 if (!(segsize > 0 && bfp->data[s_offset - 1] == '\0'))
176 /* Invalid. */
177 error (EXIT_FAILURE, 0,
178 _("file \"%s\" contains a not NUL terminated system dependent string"),
179 bfp->filename);
180 break;
181 }
182 if (sysdepref >= header->n_sysdep_segments)
183 /* Invalid. */
184 error (EXIT_FAILURE, 0, _("file \"%s\" is not in GNU .mo format"),
185 bfp->filename);
186 /* See 'struct sysdep_segment'. */
187 sysdep_segment_offset = header->sysdep_segments_offset + sysdepref * 8;
188 ss_length = get_uint32 (bfp, sysdep_segment_offset);
189 ss_offset = get_uint32 (bfp, sysdep_segment_offset + 4);
190 ss_end = xsum (ss_offset, ss_length);
191 if (size_overflow_p (ss_end) || ss_end > bfp->size)
192 error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
193 if (!(ss_length > 0 && bfp->data[ss_end - 1] == '\0'))
194 {
195 char location[30];
196 sprintf (location, "sysdep_segment[%u]", (unsigned int) sysdepref);
197 error (EXIT_FAILURE, 0,
198 _("file \"%s\" contains a not NUL terminated string, at %s"),
199 bfp->filename, location);
200 }
201 n = strlen (bfp->data + ss_offset);
202 length += (n > 1 ? 1 + n + 1 : n);
203 }
204
205 /* Allocate and fill the string. */
206 string = XNMALLOC (length, char);
207 p = string;
208 s_offset = get_uint32 (bfp, offset);
209 for (i = 4; ; i += 8)
210 {
211 nls_uint32 segsize = get_uint32 (bfp, offset + i);
212 nls_uint32 sysdepref = get_uint32 (bfp, offset + i + 4);
213 nls_uint32 sysdep_segment_offset;
214 nls_uint32 ss_length;
215 nls_uint32 ss_offset;
216 size_t n;
217
218 memcpy (p, bfp->data + s_offset, segsize);
219 p += segsize;
220 s_offset += segsize;
221
222 if (sysdepref == SEGMENTS_END)
223 break;
224 if (sysdepref >= header->n_sysdep_segments)
225 abort ();
226 /* See 'struct sysdep_segment'. */
227 sysdep_segment_offset = header->sysdep_segments_offset + sysdepref * 8;
228 ss_length = get_uint32 (bfp, sysdep_segment_offset);
229 ss_offset = get_uint32 (bfp, sysdep_segment_offset + 4);
230 if (ss_offset + ss_length > bfp->size)
231 abort ();
232 if (!(ss_length > 0 && bfp->data[ss_offset + ss_length - 1] == '\0'))
233 abort ();
234 n = strlen (bfp->data + ss_offset);
235 if (n > 1)
236 *p++ = '<';
237 memcpy (p, bfp->data + ss_offset, n);
238 p += n;
239 if (n > 1)
240 *p++ = '>';
241 }
242
243 if (p != string + length)
244 abort ();
245
246 *lengthp = length;
247 return string;
248 }
249
250 /* Reads an existing .mo file and adds the messages to mlp. */
251 void
read_mo_file(message_list_ty * mlp,const char * filename)252 read_mo_file (message_list_ty *mlp, const char *filename)
253 {
254 FILE *fp;
255 struct binary_mo_file bf;
256 struct mo_file_header header;
257 unsigned int i;
258 static lex_pos_ty pos = { __FILE__, __LINE__ };
259
260 if (strcmp (filename, "-") == 0 || strcmp (filename, "/dev/stdin") == 0)
261 {
262 fp = stdin;
263 SET_BINARY (fileno (fp));
264 }
265 else
266 {
267 fp = fopen (filename, "rb");
268 if (fp == NULL)
269 error (EXIT_FAILURE, errno,
270 _("error while opening \"%s\" for reading"), filename);
271 }
272
273 /* Read the file contents into memory. */
274 read_binary_mo_file (&bf, fp, filename);
275
276 /* Get a 32-bit number from the file header. */
277 # define GET_HEADER_FIELD(field) \
278 get_uint32 (&bf, offsetof (struct mo_file_header, field))
279
280 /* We must grope the file to determine which endian it is.
281 Perversity of the universe tends towards maximum, so it will
282 probably not match the currently executing architecture. */
283 bf.endian = MO_BIG_ENDIAN;
284 header.magic = GET_HEADER_FIELD (magic);
285 if (header.magic != _MAGIC)
286 {
287 bf.endian = MO_LITTLE_ENDIAN;
288 header.magic = GET_HEADER_FIELD (magic);
289 if (header.magic != _MAGIC)
290 {
291 unrecognised:
292 error (EXIT_FAILURE, 0, _("file \"%s\" is not in GNU .mo format"),
293 filename);
294 }
295 }
296
297 header.revision = GET_HEADER_FIELD (revision);
298
299 /* We support only the major revisions 0 and 1. */
300 switch (header.revision >> 16)
301 {
302 case 0:
303 case 1:
304 /* Fill the header parts that apply to major revisions 0 and 1. */
305 header.nstrings = GET_HEADER_FIELD (nstrings);
306 header.orig_tab_offset = GET_HEADER_FIELD (orig_tab_offset);
307 header.trans_tab_offset = GET_HEADER_FIELD (trans_tab_offset);
308 header.hash_tab_size = GET_HEADER_FIELD (hash_tab_size);
309 header.hash_tab_offset = GET_HEADER_FIELD (hash_tab_offset);
310
311 /* The following verifications attempt to ensure that 'msgunfmt' complains
312 about a .mo file that may make libintl crash at run time. */
313
314 /* Verify that the array of messages is sorted. */
315 {
316 char *prev_msgid = NULL;
317
318 for (i = 0; i < header.nstrings; i++)
319 {
320 char *msgid;
321 size_t msgid_len;
322
323 msgid = get_string (&bf, header.orig_tab_offset + i * 8,
324 &msgid_len);
325 if (i == 0)
326 prev_msgid = msgid;
327 else
328 {
329 if (!(strcmp (prev_msgid, msgid) < 0))
330 error (EXIT_FAILURE, 0,
331 _("file \"%s\" is not in GNU .mo format: The array of messages is not sorted."),
332 filename);
333 }
334 }
335 }
336
337 /* Verify the hash table. */
338 if (header.hash_tab_size > 0)
339 {
340 char *seen;
341 unsigned int j;
342
343 /* Verify the hash table's size. */
344 if (!(header.hash_tab_size > 2))
345 error (EXIT_FAILURE, 0,
346 _("file \"%s\" is not in GNU .mo format: The hash table size is invalid."),
347 filename);
348
349 /* Verify that the non-empty hash table entries contain the values
350 1, ..., nstrings, each exactly once. */
351 seen = (char *) xcalloc (header.nstrings, 1);
352 for (j = 0; j < header.hash_tab_size; j++)
353 {
354 nls_uint32 entry =
355 get_uint32 (&bf, header.hash_tab_offset + j * 4);
356
357 if (entry != 0)
358 {
359 i = entry - 1;
360 if (!(i < header.nstrings && seen[i] == 0))
361 error (EXIT_FAILURE, 0,
362 _("file \"%s\" is not in GNU .mo format: The hash table contains invalid entries."),
363 filename);
364 seen[i] = 1;
365 }
366 }
367 for (i = 0; i < header.nstrings; i++)
368 if (seen[i] == 0)
369 error (EXIT_FAILURE, 0, _("file \"%s\" is not in GNU .mo format: Some messages are not present in the hash table."),
370 filename);
371 free (seen);
372
373 /* Verify that the hash table lookup algorithm finds the entry for
374 each message. */
375 for (i = 0; i < header.nstrings; i++)
376 {
377 size_t msgid_len;
378 char *msgid = get_string (&bf, header.orig_tab_offset + i * 8,
379 &msgid_len);
380 nls_uint32 hash_val = hash_string (msgid);
381 nls_uint32 idx = hash_val % header.hash_tab_size;
382 nls_uint32 incr = 1 + (hash_val % (header.hash_tab_size - 2));
383 for (;;)
384 {
385 nls_uint32 entry =
386 get_uint32 (&bf, header.hash_tab_offset + idx * 4);
387
388 if (entry == 0)
389 error (EXIT_FAILURE, 0,
390 _("file \"%s\" is not in GNU .mo format: Some messages are at a wrong index in the hash table."),
391 filename);
392 if (entry == i + 1)
393 break;
394
395 if (idx >= header.hash_tab_size - incr)
396 idx -= header.hash_tab_size - incr;
397 else
398 idx += incr;
399 }
400 }
401 }
402
403 for (i = 0; i < header.nstrings; i++)
404 {
405 message_ty *mp;
406 char *msgctxt;
407 char *msgid;
408 size_t msgid_len;
409 char *separator;
410 char *msgstr;
411 size_t msgstr_len;
412
413 /* Read the msgctxt and msgid. */
414 msgid = get_string (&bf, header.orig_tab_offset + i * 8,
415 &msgid_len);
416 /* Split into msgctxt and msgid. */
417 separator = strchr (msgid, MSGCTXT_SEPARATOR);
418 if (separator != NULL)
419 {
420 /* The part before the MSGCTXT_SEPARATOR is the msgctxt. */
421 *separator = '\0';
422 msgctxt = msgid;
423 msgid = separator + 1;
424 msgid_len -= msgid - msgctxt;
425 }
426 else
427 msgctxt = NULL;
428
429 /* Read the msgstr. */
430 msgstr = get_string (&bf, header.trans_tab_offset + i * 8,
431 &msgstr_len);
432
433 mp = message_alloc (msgctxt,
434 msgid,
435 (strlen (msgid) + 1 < msgid_len
436 ? msgid + strlen (msgid) + 1
437 : NULL),
438 msgstr, msgstr_len,
439 &pos);
440 message_list_append (mlp, mp);
441 }
442
443 switch (header.revision & 0xffff)
444 {
445 case 0:
446 break;
447 case 1:
448 default:
449 /* Fill the header parts that apply to minor revision >= 1. */
450 header.n_sysdep_segments = GET_HEADER_FIELD (n_sysdep_segments);
451 header.sysdep_segments_offset =
452 GET_HEADER_FIELD (sysdep_segments_offset);
453 header.n_sysdep_strings = GET_HEADER_FIELD (n_sysdep_strings);
454 header.orig_sysdep_tab_offset =
455 GET_HEADER_FIELD (orig_sysdep_tab_offset);
456 header.trans_sysdep_tab_offset =
457 GET_HEADER_FIELD (trans_sysdep_tab_offset);
458
459 for (i = 0; i < header.n_sysdep_strings; i++)
460 {
461 message_ty *mp;
462 char *msgctxt;
463 char *msgid;
464 size_t msgid_len;
465 char *separator;
466 char *msgstr;
467 size_t msgstr_len;
468 nls_uint32 offset;
469 size_t f;
470
471 /* Read the msgctxt and msgid. */
472 offset = get_uint32 (&bf, header.orig_sysdep_tab_offset + i * 4);
473 msgid = get_sysdep_string (&bf, offset, &header, &msgid_len);
474 /* Split into msgctxt and msgid. */
475 separator = strchr (msgid, MSGCTXT_SEPARATOR);
476 if (separator != NULL)
477 {
478 /* The part before the MSGCTXT_SEPARATOR is the msgctxt. */
479 *separator = '\0';
480 msgctxt = msgid;
481 msgid = separator + 1;
482 msgid_len -= msgid - msgctxt;
483 }
484 else
485 msgctxt = NULL;
486
487 /* Read the msgstr. */
488 offset = get_uint32 (&bf, header.trans_sysdep_tab_offset + i * 4);
489 msgstr = get_sysdep_string (&bf, offset, &header, &msgstr_len);
490
491 mp = message_alloc (msgctxt,
492 msgid,
493 (strlen (msgid) + 1 < msgid_len
494 ? msgid + strlen (msgid) + 1
495 : NULL),
496 msgstr, msgstr_len,
497 &pos);
498
499 /* Only messages with c-format or objc-format annotation are
500 recognized as having system-dependent strings by msgfmt.
501 Which one of the two, we don't know. We have to guess,
502 assuming that c-format is more probable than objc-format and
503 that the .mo was likely produced by "msgfmt -c". */
504 for (f = format_c; ; f = format_objc)
505 {
506 bool valid = true;
507 struct formatstring_parser *parser = formatstring_parsers[f];
508 const char *str_end;
509 const char *str;
510
511 str_end = msgid + msgid_len;
512 for (str = msgid; str < str_end; str += strlen (str) + 1)
513 {
514 char *invalid_reason = NULL;
515 void *descr =
516 parser->parse (str, false, NULL, &invalid_reason);
517
518 if (descr != NULL)
519 parser->free (descr);
520 else
521 {
522 free (invalid_reason);
523 valid = false;
524 break;
525 }
526 }
527 if (valid)
528 {
529 str_end = msgstr + msgstr_len;
530 for (str = msgstr; str < str_end; str += strlen (str) + 1)
531 {
532 char *invalid_reason = NULL;
533 void *descr =
534 parser->parse (str, true, NULL, &invalid_reason);
535
536 if (descr != NULL)
537 parser->free (descr);
538 else
539 {
540 free (invalid_reason);
541 valid = false;
542 break;
543 }
544 }
545 }
546
547 if (valid)
548 {
549 /* Found the most likely among c-format, objc-format. */
550 mp->is_format[f] = yes;
551 break;
552 }
553
554 /* Try next f. */
555 if (f == format_objc)
556 break;
557 }
558
559 message_list_append (mlp, mp);
560 }
561 break;
562 }
563 break;
564
565 default:
566 goto unrecognised;
567 }
568
569 if (fp != stdin)
570 fclose (fp);
571 }
572