1 /* Conversion loop frame work.
2 Copyright (C) 1998-2014 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
/* This file provides a frame for the reader loop in all conversion modules.
   The actual code must (of course) be provided in the actual module source
   code, but certain actions can be written down generically, with some
   customization options, which are these:

     MIN_NEEDED_INPUT   minimal number of input bytes needed for the next
                        conversion.
     MIN_NEEDED_OUTPUT  minimal number of bytes produced by the next round
                        of conversion.

     MAX_NEEDED_INPUT   as you may guess, this is the maximal number of
                        input bytes needed.  It defaults to MIN_NEEDED_INPUT.
     MAX_NEEDED_OUTPUT  likewise for output bytes.

     LOOPFCT            name of the function created.  If not specified
                        the name is `loop', but this prevents the use
                        of multiple functions in the same file.

     BODY               this is supposed to expand to the body of the loop.
                        The user must provide this.

     EXTRA_LOOP_DECLS   extra arguments passed from the conversion loop call.

     INIT_PARAMS        code to define and initialize variables from params.
     UPDATE_PARAMS      code to store the result in params.

     ONEBYTE_BODY       body of the specialized conversion function for a
                        single byte from the current character set to
                        INTERNAL.
*/
49
50 #include <assert.h>
51 #include <endian.h>
52 #include <gconv.h>
53 #include <stdint.h>
54 #include <string.h>
55 #include <wchar.h>
56 #include <sys/param.h> /* For MIN. */
57 #define __need_size_t
58 #include <stddef.h>
59
/* Transliteration entry point used by STANDARD_TO_LOOP_ERR_HANDLER when the
   step has __GCONV_TRANSLIT set.  It is handed the step, the step data, the
   start of the input buffer, the address of the current input pointer, the
   end of the input, the address of the output pointer and the counter of
   irreversible conversions, and returns a __GCONV_* status code.  */
extern int __gconv_transliterate (struct __gconv_step *step,
                                  struct __gconv_step_data *step_data,
                                  const unsigned char *inbufstart,
                                  const unsigned char **inbufp,
                                  const unsigned char *inbufend,
                                  unsigned char **outbufstart,
                                  size_t *irreversible);
67
/* We have to provide support for machines which are not able to handle
   unaligned memory accesses.  Some of the character encodings have
   representations with a fixed width of 2 or 4 bytes.  But if we cannot
   access unaligned memory we still have to read byte-wise.

   get16/get32 read and put16/put32 write a 16-bit or 32-bit value in host
   byte order at ADDR.  FCTNAME (name) yields NAME unchanged when the direct
   accessors are selected and NAME_unaligned when the byte-wise accessors
   are selected.  */
#undef FCTNAME2
#if _STRING_ARCH_unaligned || !defined DEFINE_UNALIGNED
/* We can handle unaligned memory access.  */
# define get16(addr) (*((const uint16_t *) (addr)))
# define get32(addr) (*((const uint32_t *) (addr)))

/* We need no special support for writing values either.  */
# define put16(addr, val) (*((uint16_t *) (addr)) = (val))
# define put32(addr, val) (*((uint32_t *) (addr)) = (val))

# define FCTNAME2(name) name
#else
/* Distinguish between big endian and little endian.  */
# if __BYTE_ORDER == __LITTLE_ENDIAN
#  define get16(addr) \
     (((const unsigned char *) (addr))[1] << 8 \
      | ((const unsigned char *) (addr))[0])
/* The most significant byte is widened to uint32_t before shifting so that
   a value >= 0x80 is never shifted into the sign bit of an int.  */
#  define get32(addr) \
     (((((uint32_t) ((const unsigned char *) (addr))[3]) << 8 \
        | ((const unsigned char *) (addr))[2]) << 8 \
       | ((const unsigned char *) (addr))[1]) << 8 \
      | ((const unsigned char *) (addr))[0])

#  define put16(addr, val) \
     ({ uint16_t __val = (val); \
        ((unsigned char *) (addr))[0] = __val; \
        ((unsigned char *) (addr))[1] = __val >> 8; \
        (void) 0; })
#  define put32(addr, val) \
     ({ uint32_t __val = (val); \
        ((unsigned char *) (addr))[0] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[1] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[2] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[3] = __val; \
        (void) 0; })
# else
#  define get16(addr) \
     (((const unsigned char *) (addr))[0] << 8 \
      | ((const unsigned char *) (addr))[1])
/* See the little endian variant for why the first byte is widened.  */
#  define get32(addr) \
     (((((uint32_t) ((const unsigned char *) (addr))[0]) << 8 \
        | ((const unsigned char *) (addr))[1]) << 8 \
       | ((const unsigned char *) (addr))[2]) << 8 \
      | ((const unsigned char *) (addr))[3])

#  define put16(addr, val) \
     ({ uint16_t __val = (val); \
        ((unsigned char *) (addr))[1] = __val; \
        ((unsigned char *) (addr))[0] = __val >> 8; \
        (void) 0; })
#  define put32(addr, val) \
     ({ uint32_t __val = (val); \
        ((unsigned char *) (addr))[3] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[2] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[1] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[0] = __val; \
        (void) 0; })
# endif

# define FCTNAME2(name) name##_unaligned
#endif
#define FCTNAME(name) FCTNAME2(name)
140
141
142 /* We need at least one byte for the next round. */
143 #ifndef MIN_NEEDED_INPUT
144 # error "MIN_NEEDED_INPUT definition missing"
145 #elif MIN_NEEDED_INPUT < 1
146 # error "MIN_NEEDED_INPUT must be >= 1"
147 #endif
148
149 /* Let's see how many bytes we produce. */
150 #ifndef MAX_NEEDED_INPUT
151 # define MAX_NEEDED_INPUT MIN_NEEDED_INPUT
152 #endif
153
154 /* We produce at least one byte in the next round. */
155 #ifndef MIN_NEEDED_OUTPUT
156 # error "MIN_NEEDED_OUTPUT definition missing"
157 #elif MIN_NEEDED_OUTPUT < 1
158 # error "MIN_NEEDED_OUTPUT must be >= 1"
159 #endif
160
161 /* Let's see how many bytes we produce. */
162 #ifndef MAX_NEEDED_OUTPUT
163 # define MAX_NEEDED_OUTPUT MIN_NEEDED_OUTPUT
164 #endif
165
166 /* Default name for the function. */
167 #ifndef LOOPFCT
168 # define LOOPFCT loop
169 #endif
170
171 /* Make sure we have a loop body. */
172 #ifndef BODY
173 # error "Definition of BODY missing for function" LOOPFCT
174 #endif
175
176
177 /* If no arguments have to passed to the loop function define the macro
178 as empty. */
179 #ifndef EXTRA_LOOP_DECLS
180 # define EXTRA_LOOP_DECLS
181 #endif
182
183 /* Allow using UPDATE_PARAMS in macros where #ifdef UPDATE_PARAMS test
184 isn't possible. */
185 #ifndef UPDATE_PARAMS
186 # define UPDATE_PARAMS do { } while (0)
187 #endif
188 #ifndef REINIT_PARAMS
189 # define REINIT_PARAMS do { } while (0)
190 #endif
191
192
/* To make it easier for the writers of the modules, we define a macro
   to test whether we have to ignore errors.  It expects the variables
   `irreversible' and `flags' to be in scope at the point of use and is
   nonzero only if IRREVERSIBLE is not NULL and FLAGS has
   __GCONV_IGNORE_ERRORS set.  */
#define ignore_errors_p() \
  (irreversible != NULL && (flags & __GCONV_IGNORE_ERRORS))
197
198
/* Error handling for the FROM_LOOP direction, with ignoring of errors.
   Note that we cannot use the do while (0) trick since `break' and
   `continue' must reach certain points.

   Must be expanded inside the conversion loop.  It sets `result' to
   __GCONV_ILLEGAL_INPUT and leaves the loop with `break' unless errors are
   ignored; otherwise it advances `inptr' by INCR bytes, increments
   `*irreversible' and starts the next iteration with `continue'.  */
#define STANDARD_FROM_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    if (! ignore_errors_p ()) \
      break; \
 \
    /* We ignore the invalid input byte sequence.  */ \
    inptr += (Incr); \
    ++*irreversible; \
    /* But we keep result == __GCONV_ILLEGAL_INPUT, because of the constraint \
       that "iconv -c" must give the same exitcode as "iconv".  */ \
    continue; \
  }
216
/* Error handling for the TO_LOOP direction, with use of transliteration/
   transcription functions and ignoring of errors.  Note that we cannot use
   the do while (0) trick since `break' and `continue' must reach certain
   points.

   Must be expanded inside the conversion loop with `result', `irreversible',
   `step', `step_data', `inptrp', `inptr', `inend' and `outptr' in scope
   (plus `flags' for ignore_errors_p).  Two variants exist: before glibc
   2.21 the list of transliteration modules in step_data->__trans is
   walked; from 2.21 on __gconv_transliterate is called directly when the
   step has __GCONV_TRANSLIT set.  */
#if ! __GLIBC_PREREQ(2,21)
# define STANDARD_TO_LOOP_ERR_HANDLER(Incr) \
  { \
    struct __gconv_trans_data *trans; \
 \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    if (irreversible == NULL) \
      /* This means we are in call from __gconv_transliterate.  In this \
         case we are not doing any error recovery ourselves.  */ \
      break; \
 \
    /* If needed, flush any conversion state, so that __gconv_transliterate \
       starts with current shift state.  */ \
    UPDATE_PARAMS; \
 \
    /* First try the transliteration methods.  */ \
    for (trans = step_data->__trans; trans != NULL; trans = trans->__next) \
      { \
        result = DL_CALL_FCT (trans->__trans_fct, \
                              (step, step_data, trans->__data, *inptrp, \
                               &inptr, inend, &outptr, irreversible)); \
        if (result != __GCONV_ILLEGAL_INPUT) \
          break; \
      } \
 \
    REINIT_PARAMS; \
 \
    /* If any of them recognized the input continue with the loop.  */ \
    if (result != __GCONV_ILLEGAL_INPUT) \
      { \
        if (__glibc_unlikely (result == __GCONV_FULL_OUTPUT)) \
          break; \
 \
        continue; \
      } \
 \
    /* Next see whether we have to ignore the error.  If not, stop.  */ \
    if (! ignore_errors_p ()) \
      break; \
 \
    /* When we come here it means we ignore the character.  */ \
    ++*irreversible; \
    inptr += (Incr); \
    /* But we keep result == __GCONV_ILLEGAL_INPUT, because of the constraint \
       that "iconv -c" must give the same exitcode as "iconv".  */ \
    continue; \
  }
#else
# define STANDARD_TO_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    if (irreversible == NULL) \
      /* This means we are in call from __gconv_transliterate.  In this \
         case we are not doing any error recovery ourselves.  */ \
      break; \
 \
    /* If needed, flush any conversion state, so that __gconv_transliterate \
       starts with current shift state.  */ \
    UPDATE_PARAMS; \
 \
    /* First try the transliteration methods.  */ \
    if ((step_data->__flags & __GCONV_TRANSLIT) != 0) \
      result = __gconv_transliterate \
        (step, step_data, *inptrp, \
         &inptr, inend, &outptr, irreversible); \
 \
    REINIT_PARAMS; \
 \
    /* If any of them recognized the input continue with the loop.  */ \
    if (result != __GCONV_ILLEGAL_INPUT) \
      { \
        if (__glibc_unlikely (result == __GCONV_FULL_OUTPUT)) \
          break; \
 \
        continue; \
      } \
 \
    /* Next see whether we have to ignore the error.  If not, stop.  */ \
    if (! ignore_errors_p ()) \
      break; \
 \
    /* When we come here it means we ignore the character.  */ \
    ++*irreversible; \
    inptr += (Incr); \
    /* But we keep result == __GCONV_ILLEGAL_INPUT, because of the constraint \
       that "iconv -c" must give the same exitcode as "iconv".  */ \
    continue; \
  }
#endif
312
313
/* Handling of Unicode 3.1 TAG characters.  Unicode recommends
   "If language codes are not relevant to the particular processing
   operation, then they should be ignored."  This macro is usually
   called right before STANDARD_TO_LOOP_ERR_HANDLER (Incr).

   Must be expanded inside the conversion loop: if CHARACTER is a TAG
   character, `inptr' is advanced by INCR bytes and the next iteration is
   started with `continue'; otherwise nothing happens.  */
#define UNICODE_TAG_HANDLER(Character, Incr) \
  { \
    /* TAG characters are those in the range U+E0000..U+E007F.  */ \
    if (((Character) >> 7) == (0xe0000 >> 7)) \
      { \
        inptr += (Incr); \
        continue; \
      } \
  }
327
328
329 /* The function returns the status, as defined in gconv.h. */
330 static inline int
331 __attribute ((always_inline))
FCTNAME(LOOPFCT)332 FCTNAME (LOOPFCT) (struct __gconv_step *step,
333 struct __gconv_step_data *step_data,
334 const unsigned char **inptrp, const unsigned char *inend,
335 unsigned char **outptrp, const unsigned char *outend,
336 size_t *irreversible EXTRA_LOOP_DECLS)
337 {
338 #ifdef LOOP_NEED_STATE
339 mbstate_t *state = step_data->__statep;
340 #endif
341 #ifdef LOOP_NEED_FLAGS
342 int flags = step_data->__flags;
343 #endif
344 #ifdef LOOP_NEED_DATA
345 void *data = step->__data;
346 #endif
347 int result = __GCONV_EMPTY_INPUT;
348 const unsigned char *inptr = *inptrp;
349 unsigned char *outptr = *outptrp;
350
351 #ifdef INIT_PARAMS
352 INIT_PARAMS;
353 #endif
354
355 while (inptr != inend)
356 {
357 /* `if' cases for MIN_NEEDED_OUTPUT ==/!= 1 is made to help the
358 compiler generating better code. They will be optimized away
359 since MIN_NEEDED_OUTPUT is always a constant. */
360 if (MIN_NEEDED_INPUT > 1
361 && __builtin_expect (inptr + MIN_NEEDED_INPUT > inend, 0))
362 {
363 /* We don't have enough input for another complete input
364 character. */
365 result = __GCONV_INCOMPLETE_INPUT;
366 break;
367 }
368 if ((MIN_NEEDED_OUTPUT != 1
369 && __builtin_expect (outptr + MIN_NEEDED_OUTPUT > outend, 0))
370 || (MIN_NEEDED_OUTPUT == 1
371 && __builtin_expect (outptr >= outend, 0)))
372 {
373 /* Overflow in the output buffer. */
374 result = __GCONV_FULL_OUTPUT;
375 break;
376 }
377
378 /* Here comes the body the user provides. It can stop with
379 RESULT set to GCONV_INCOMPLETE_INPUT (if the size of the
380 input characters vary in size), GCONV_ILLEGAL_INPUT, or
381 GCONV_FULL_OUTPUT (if the output characters vary in size). */
382 BODY
383 }
384
385 /* Update the pointers pointed to by the parameters. */
386 *inptrp = inptr;
387 *outptrp = outptr;
388 UPDATE_PARAMS;
389
390 return result;
391 }
392
393
394 /* Include the file a second time to define the function to handle
395 unaligned access. */
396 #if !defined DEFINE_UNALIGNED && !_STRING_ARCH_unaligned \
397 && MIN_NEEDED_INPUT != 1 && MAX_NEEDED_INPUT % MIN_NEEDED_INPUT == 0 \
398 && MIN_NEEDED_OUTPUT != 1 && MAX_NEEDED_OUTPUT % MIN_NEEDED_OUTPUT == 0
399 # undef get16
400 # undef get32
401 # undef put16
402 # undef put32
403 # undef unaligned
404
405 # define DEFINE_UNALIGNED
406 # include "loop.c"
407 # undef DEFINE_UNALIGNED
408 #else
409 # if MAX_NEEDED_INPUT > 1
410 # define SINGLE(fct) SINGLE2 (fct)
411 # define SINGLE2(fct) fct##_single
412 static inline int
413 __attribute ((always_inline))
SINGLE(LOOPFCT)414 SINGLE(LOOPFCT) (struct __gconv_step *step,
415 struct __gconv_step_data *step_data,
416 const unsigned char **inptrp, const unsigned char *inend,
417 unsigned char **outptrp, unsigned char *outend,
418 size_t *irreversible EXTRA_LOOP_DECLS)
419 {
420 mbstate_t *state = step_data->__statep;
421 # ifdef LOOP_NEED_FLAGS
422 int flags = step_data->__flags;
423 # endif
424 # ifdef LOOP_NEED_DATA
425 void *data = step->__data;
426 # endif
427 int result = __GCONV_OK;
428 unsigned char bytebuf[MAX_NEEDED_INPUT];
429 const unsigned char *inptr = *inptrp;
430 unsigned char *outptr = *outptrp;
431 size_t inlen;
432
433 # ifdef INIT_PARAMS
434 INIT_PARAMS;
435 # endif
436
437 # ifdef UNPACK_BYTES
438 UNPACK_BYTES
439 # else
440 /* Add the bytes from the state to the input buffer. */
441 assert ((state->__count & 7) <= sizeof (state->__value));
442 for (inlen = 0; inlen < (size_t) (state->__count & 7); ++inlen)
443 bytebuf[inlen] = state->__value.__wchb[inlen];
444 # endif
445
446 /* Are there enough bytes in the input buffer? */
447 if (MIN_NEEDED_INPUT > 1
448 && __builtin_expect (inptr + (MIN_NEEDED_INPUT - inlen) > inend, 0))
449 {
450 *inptrp = inend;
451 # ifdef STORE_REST
452 while (inptr < inend)
453 bytebuf[inlen++] = *inptr++;
454
455 inptr = bytebuf;
456 inptrp = &inptr;
457 inend = &bytebuf[inlen];
458
459 STORE_REST
460 # else
461 /* We don't have enough input for another complete input
462 character. */
463 while (inptr < inend)
464 state->__value.__wchb[inlen++] = *inptr++;
465 # endif
466
467 return __GCONV_INCOMPLETE_INPUT;
468 }
469
470 /* Enough space in output buffer. */
471 if ((MIN_NEEDED_OUTPUT != 1 && outptr + MIN_NEEDED_OUTPUT > outend)
472 || (MIN_NEEDED_OUTPUT == 1 && outptr >= outend))
473 /* Overflow in the output buffer. */
474 return __GCONV_FULL_OUTPUT;
475
476 /* Now add characters from the normal input buffer. */
477 do
478 bytebuf[inlen++] = *inptr++;
479 while (inlen < MAX_NEEDED_INPUT && inptr < inend);
480
481 inptr = bytebuf;
482 inend = &bytebuf[inlen];
483
484 do
485 {
486 BODY
487 }
488 while (0);
489
490 /* Now we either have produced an output character and consumed all the
491 bytes from the state and at least one more, or the character is still
492 incomplete, or we have some other error (like illegal input character,
493 no space in output buffer). */
494 if (__glibc_likely (inptr != bytebuf))
495 {
496 /* We found a new character. */
497 assert (inptr - bytebuf > (state->__count & 7));
498
499 *inptrp += inptr - bytebuf - (state->__count & 7);
500 *outptrp = outptr;
501
502 result = __GCONV_OK;
503
504 /* Clear the state buffer. */
505 # ifdef CLEAR_STATE
506 CLEAR_STATE;
507 # else
508 state->__count &= ~7;
509 # endif
510 }
511 else if (result == __GCONV_INCOMPLETE_INPUT)
512 {
513 /* This can only happen if we have less than MAX_NEEDED_INPUT bytes
514 available. */
515 assert (inend != &bytebuf[MAX_NEEDED_INPUT]);
516
517 *inptrp += inend - bytebuf - (state->__count & 7);
518 # ifdef STORE_REST
519 inptrp = &inptr;
520
521 STORE_REST
522 # else
523 /* We don't have enough input for another complete input
524 character. */
525 assert (inend - inptr > (state->__count & ~7));
526 assert (inend - inptr <= sizeof (state->__value));
527 state->__count = (state->__count & ~7) | (inend - inptr);
528 inlen = 0;
529 while (inptr < inend)
530 state->__value.__wchb[inlen++] = *inptr++;
531 # endif
532 }
533
534 return result;
535 }
536 # undef SINGLE
537 # undef SINGLE2
538 # endif
539
540
541 # ifdef ONEBYTE_BODY
542 /* Define the shortcut function for btowc. */
543 static wint_t
544 gconv_btowc (struct __gconv_step *step, unsigned char c)
545 ONEBYTE_BODY
546 # define FROM_ONEBYTE gconv_btowc
547 # endif
548
549 #endif
550
/* We remove the macro definitions so that we can include this file again
   for the definition of another function.  */
#undef MIN_NEEDED_INPUT
#undef MAX_NEEDED_INPUT
#undef MIN_NEEDED_OUTPUT
#undef MAX_NEEDED_OUTPUT
#undef LOOPFCT
#undef BODY
#undef EXTRA_LOOP_DECLS
#undef INIT_PARAMS
#undef UPDATE_PARAMS
#undef REINIT_PARAMS
#undef ONEBYTE_BODY
#undef UNPACK_BYTES
#undef CLEAR_STATE
#undef LOOP_NEED_STATE
#undef LOOP_NEED_FLAGS
#undef LOOP_NEED_DATA
#undef get16
#undef get32
#undef put16
#undef put32
#undef unaligned
575