• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Convert multibyte character to wide character.
2    Copyright (C) 1999-2002, 2005-2009 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2008.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17 
18 #include <config.h>
19 
20 /* Specification.  */
21 #include <wchar.h>
22 
23 #if GNULIB_defined_mbstate_t
24 /* Implement mbrtowc() on top of mbtowc().  */
25 
26 # include <errno.h>
27 # include <stdlib.h>
28 
29 # include "localcharset.h"
30 # include "streq.h"
31 # include "verify.h"
32 
33 
34 verify (sizeof (mbstate_t) >= 4);
35 
36 static char internal_state[4];
37 
38 size_t
mbrtowc(wchar_t * pwc,const char * s,size_t n,mbstate_t * ps)39 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
40 {
41   char *pstate = (char *)ps;
42 
43   if (pstate == NULL)
44     pstate = internal_state;
45 
46   if (s == NULL)
47     {
48       pwc = NULL;
49       s = "";
50       n = 1;
51     }
52 
53   if (n == 0)
54     return (size_t)(-2);
55 
56   /* Here n > 0.  */
57   {
58     size_t nstate = pstate[0];
59     char buf[4];
60     const char *p;
61     size_t m;
62 
63     switch (nstate)
64       {
65       case 0:
66 	p = s;
67 	m = n;
68 	break;
69       case 3:
70 	buf[2] = pstate[3];
71 	/*FALLTHROUGH*/
72       case 2:
73 	buf[1] = pstate[2];
74 	/*FALLTHROUGH*/
75       case 1:
76 	buf[0] = pstate[1];
77 	p = buf;
78 	m = nstate;
79 	buf[m++] = s[0];
80 	if (n >= 2 && m < 4)
81 	  {
82 	    buf[m++] = s[1];
83 	    if (n >= 3 && m < 4)
84 	      buf[m++] = s[2];
85 	  }
86 	break;
87       default:
88 	errno = EINVAL;
89 	return (size_t)(-1);
90       }
91 
92     /* Here m > 0.  */
93 
94 # if __GLIBC__
95     /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
96     mbtowc (NULL, NULL, 0);
97 # endif
98     {
99       int res = mbtowc (pwc, p, m);
100 
101       if (res >= 0)
102 	{
103 	  if (pwc != NULL && ((*pwc == 0) != (res == 0)))
104 	    abort ();
105 	  if (nstate >= (res > 0 ? res : 1))
106 	    abort ();
107 	  res -= nstate;
108 	  pstate[0] = 0;
109 	  return res;
110 	}
111 
112       /* mbtowc does not distinguish between invalid and incomplete multibyte
113 	 sequences.  But mbrtowc needs to make this distinction.
114 	 There are two possible approaches:
115 	   - Use iconv() and its return value.
116 	   - Use built-in knowledge about the possible encodings.
117 	 Given the low quality of implementation of iconv() on the systems that
118 	 lack mbrtowc(), we use the second approach.
119 	 The possible encodings are:
120 	   - 8-bit encodings,
121 	   - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
122 	   - UTF-8.
123 	 Use specialized code for each.  */
124       if (m >= 4 || m >= MB_CUR_MAX)
125 	goto invalid;
126       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
127       {
128 	const char *encoding = locale_charset ();
129 
130 	if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
131 	  {
132 	    /* Cf. unistr/u8-mblen.c.  */
133 	    unsigned char c = (unsigned char) p[0];
134 
135 	    if (c >= 0xc2)
136 	      {
137 		if (c < 0xe0)
138 		  {
139 		    if (m == 1)
140 		      goto incomplete;
141 		  }
142 		else if (c < 0xf0)
143 		  {
144 		    if (m == 1)
145 		      goto incomplete;
146 		    if (m == 2)
147 		      {
148 			unsigned char c2 = (unsigned char) p[1];
149 
150 			if ((c2 ^ 0x80) < 0x40
151 			    && (c >= 0xe1 || c2 >= 0xa0)
152 			    && (c != 0xed || c2 < 0xa0))
153 			  goto incomplete;
154 		      }
155 		  }
156 		else if (c <= 0xf4)
157 		  {
158 		    if (m == 1)
159 		      goto incomplete;
160 		    else /* m == 2 || m == 3 */
161 		      {
162 			unsigned char c2 = (unsigned char) p[1];
163 
164 			if ((c2 ^ 0x80) < 0x40
165 			    && (c >= 0xf1 || c2 >= 0x90)
166 			    && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
167 			  {
168 			    if (m == 2)
169 			      goto incomplete;
170 			    else /* m == 3 */
171 			      {
172 				unsigned char c3 = (unsigned char) p[2];
173 
174 				if ((c3 ^ 0x80) < 0x40)
175 				  goto incomplete;
176 			      }
177 			  }
178 		      }
179 		  }
180 	      }
181 	    goto invalid;
182 	  }
183 
184 	/* As a reference for this code, you can use the GNU libiconv
185 	   implementation.  Look for uses of the RET_TOOFEW macro.  */
186 
187 	if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
188 	  {
189 	    if (m == 1)
190 	      {
191 		unsigned char c = (unsigned char) p[0];
192 
193 		if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
194 		  goto incomplete;
195 	      }
196 	    if (m == 2)
197 	      {
198 		unsigned char c = (unsigned char) p[0];
199 
200 		if (c == 0x8f)
201 		  {
202 		    unsigned char c2 = (unsigned char) p[1];
203 
204 		    if (c2 >= 0xa1 && c2 < 0xff)
205 		      goto incomplete;
206 		  }
207 	      }
208 	    goto invalid;
209 	  }
210 	if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
211 	    || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
212 	    || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
213 	  {
214 	    if (m == 1)
215 	      {
216 		unsigned char c = (unsigned char) p[0];
217 
218 		if (c >= 0xa1 && c < 0xff)
219 		  goto incomplete;
220 	      }
221 	    goto invalid;
222 	  }
223 	if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
224 	  {
225 	    if (m == 1)
226 	      {
227 		unsigned char c = (unsigned char) p[0];
228 
229 		if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
230 		  goto incomplete;
231 	      }
232 	    else /* m == 2 || m == 3 */
233 	      {
234 		unsigned char c = (unsigned char) p[0];
235 
236 		if (c == 0x8e)
237 		  goto incomplete;
238 	      }
239 	    goto invalid;
240 	  }
241 	if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
242 	  {
243 	    if (m == 1)
244 	      {
245 		unsigned char c = (unsigned char) p[0];
246 
247 		if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
248 		  goto incomplete;
249 	      }
250 	    else /* m == 2 || m == 3 */
251 	      {
252 		unsigned char c = (unsigned char) p[0];
253 
254 		if (c >= 0x90 && c <= 0xe3)
255 		  {
256 		    unsigned char c2 = (unsigned char) p[1];
257 
258 		    if (c2 >= 0x30 && c2 <= 0x39)
259 		      {
260 			if (m == 2)
261 			  goto incomplete;
262 			else /* m == 3 */
263 			  {
264 			    unsigned char c3 = (unsigned char) p[2];
265 
266 			    if (c3 >= 0x81 && c3 <= 0xfe)
267 			      goto incomplete;
268 			  }
269 		      }
270 		  }
271 	      }
272 	    goto invalid;
273 	  }
274 	if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
275 	  {
276 	    if (m == 1)
277 	      {
278 		unsigned char c = (unsigned char) p[0];
279 
280 		if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
281 		    || (c >= 0xf0 && c <= 0xf9))
282 		  goto incomplete;
283 	      }
284 	    goto invalid;
285 	  }
286 
287 	/* An unknown multibyte encoding.  */
288 	goto incomplete;
289       }
290 
291      incomplete:
292       {
293 	size_t k = nstate;
294 	/* Here 0 <= k < m < 4.  */
295 	pstate[++k] = s[0];
296 	if (k < m)
297 	  {
298 	    pstate[++k] = s[1];
299 	    if (k < m)
300 	      pstate[++k] = s[2];
301 	  }
302 	if (k != m)
303 	  abort ();
304       }
305       pstate[0] = m;
306       return (size_t)(-2);
307 
308      invalid:
309       errno = EILSEQ;
310       /* The conversion state is undefined, says POSIX.  */
311       return (size_t)(-1);
312     }
313   }
314 }
315 
316 #else
317 /* Override the system's mbrtowc() function.  */
318 
319 # undef mbrtowc
320 
/* Wrapper around the system's mbrtowc() that compensates for the defects
   selected by the MBRTOWC_*_BUG macros.
   PWC, if non-NULL, receives the converted wide character; S points to at
   most N bytes of input; PS is the conversion state (may be NULL).
   Returns what mbrtowc() is specified to return: the number of bytes
   consumed, 0 for the null character, (size_t)(-2) for an incomplete
   character, or (size_t)(-1) for an invalid sequence.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
# if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_NUL_RETVAL_BUG
  /* A null S is specified to behave like mbrtowc (NULL, "", 1, ps).
     Rewriting the arguments is also required on the MBRTOWC_NUL_RETVAL_BUG
     path below: that path reads the result from a local variable, which
     the system function would leave unset when S is NULL, and it must not
     store anything through the caller's PWC in that case.  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We can not call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
	/* Parse the rest of the multibyte character byte for byte, counting
	   the bytes ourselves instead of trusting the return value.  */
	size_t count = 0;
	for (; n > 0; s++, n--)
	  {
	    wchar_t wc;
	    size_t ret = mbrtowc (&wc, s, 1, ps);

	    if (ret == (size_t)(-1))
	      return (size_t)(-1);
	    count++;
	    if (ret != (size_t)(-2))
	      {
		/* The multibyte character has been completed.  */
		if (pwc != NULL)
		  *pwc = wc;
		return (wc == 0 ? 0 : count);
	      }
	  }
	/* All N bytes consumed and the character is still incomplete.  */
	return (size_t)(-2);
      }
  }
# endif

# if MBRTOWC_NUL_RETVAL_BUG
  {
    wchar_t wc;
    size_t ret = mbrtowc (&wc, s, n, ps);

    if (ret != (size_t)(-1) && ret != (size_t)(-2))
      {
	if (pwc != NULL)
	  *pwc = wc;
	/* Force the return value for the null character to 0.  */
	if (wc == 0)
	  ret = 0;
      }
    return ret;
  }
# else
  return mbrtowc (pwc, s, n, ps);
# endif
}
385 
386 #endif
387