1 /* $OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */
2
3 /*-
4 * Copyright (c) 2002-2004 Tim J. Robbins
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <errno.h>
30 #include <string.h>
31 #include <sys/param.h>
32 #include <uchar.h>
33 #include <wchar.h>
34
35 #include "private/bionic_mbstate.h"
36
37 //
38 // This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a
39 // 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where
40 // mbstate_t was only 4 bytes.
41 //
// The state is the UTF-8 sequence. We only support <= 4-byte sequences so LP32
43 // mbstate_t already has enough space (out of the 4 available bytes we only
44 // need 3 since we should never need to store the entire sequence in the
45 // intermediary state).
46 //
47 // The C standard leaves the conversion state undefined after a bad conversion.
48 // To avoid unexpected failures due to the possible use of the internal private
49 // state we always reset the conversion state when encountering illegal
50 // sequences.
51 //
52 // We also implement the POSIX interface directly rather than being accessed via
53 // function pointers.
54 //
55
mbsinit(const mbstate_t * ps)56 int mbsinit(const mbstate_t* ps) {
57 return ps == nullptr || mbstate_is_initial(ps);
58 }
59
// Converts the next multibyte character found in the first `n` bytes of `s`
// into a wide character stored through `pwc`, by delegating to mbrtoc32().
// If the caller passes a null `ps`, a function-local static state is used.
// The return value is mbrtoc32()'s return value, unmodified.
size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) {
  static mbstate_t __private_state;
  if (ps == nullptr) {
    ps = &__private_state;
  }

  // Our wchar_t is UTF-32, so the output slot is handed over as a char32_t*.
  char32_t* out = reinterpret_cast<char32_t*>(pwc);
  return mbrtoc32(out, s, n, ps);
}
67
mbsnrtowcs(wchar_t * dst,const char ** src,size_t nmc,size_t len,mbstate_t * ps)68 size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
69 static mbstate_t __private_state;
70 mbstate_t* state = (ps == nullptr) ? &__private_state : ps;
71 size_t i, o, r;
72
73 // The fast paths in the loops below are not safe if an ASCII
74 // character appears as anything but the first byte of a
75 // multibyte sequence. Check now to avoid doing it in the loops.
76 if (nmc > 0 && mbstate_bytes_so_far(state) > 0 && static_cast<uint8_t>((*src)[0]) < 0x80) {
77 return mbstate_reset_and_return_illegal(EILSEQ, state);
78 }
79
80 // Measure only?
81 if (dst == nullptr) {
82 for (i = o = 0; i < nmc; i += r, o++) {
83 if (static_cast<uint8_t>((*src)[i]) < 0x80) {
84 // Fast path for plain ASCII characters.
85 if ((*src)[i] == '\0') {
86 return mbstate_reset_and_return(o, state);
87 }
88 r = 1;
89 } else {
90 r = mbrtowc(nullptr, *src + i, nmc - i, state);
91 if (r == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) {
92 return mbstate_reset_and_return_illegal(EILSEQ, state);
93 }
94 if (r == BIONIC_MULTIBYTE_RESULT_INCOMPLETE_SEQUENCE) {
95 return mbstate_reset_and_return_illegal(EILSEQ, state);
96 }
97 if (r == 0) {
98 return mbstate_reset_and_return(o, state);
99 }
100 }
101 }
102 return mbstate_reset_and_return(o, state);
103 }
104
105 // Actually convert, updating `dst` and `src`.
106 for (i = o = 0; i < nmc && o < len; i += r, o++) {
107 if (static_cast<uint8_t>((*src)[i]) < 0x80) {
108 // Fast path for plain ASCII characters.
109 dst[o] = (*src)[i];
110 r = 1;
111 if ((*src)[i] == '\0') {
112 *src = nullptr;
113 return mbstate_reset_and_return(o, state);
114 }
115 } else {
116 r = mbrtowc(dst + o, *src + i, nmc - i, state);
117 if (r == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) {
118 *src += i;
119 return mbstate_reset_and_return_illegal(EILSEQ, state);
120 }
121 if (r == BIONIC_MULTIBYTE_RESULT_INCOMPLETE_SEQUENCE) {
122 *src += nmc;
123 return mbstate_reset_and_return_illegal(EILSEQ, state);
124 }
125 if (r == 0) {
126 *src = nullptr;
127 return mbstate_reset_and_return(o, state);
128 }
129 }
130 }
131 *src += i;
132 return mbstate_reset_and_return(o, state);
133 }
134
mbsrtowcs(wchar_t * dst,const char ** src,size_t len,mbstate_t * ps)135 size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
136 return mbsnrtowcs(dst, src, SIZE_MAX, len, ps);
137 }
138 __strong_alias(mbsrtowcs_l, mbsrtowcs);
139
// Converts the wide character `wc` to its multibyte form in `s` by delegating
// to c32rtomb(). If the caller passes a null `ps`, a function-local static
// state is used. The return value is c32rtomb()'s return value, unmodified.
size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) {
  static mbstate_t __private_state;
  if (ps == nullptr) {
    ps = &__private_state;
  }

  // Our wchar_t is UTF-32, so the value converts directly to a char32_t.
  const char32_t code_point = static_cast<char32_t>(wc);
  return c32rtomb(s, code_point, ps);
}
147
wcsnrtombs(char * dst,const wchar_t ** src,size_t nwc,size_t len,mbstate_t * ps)148 size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
149 static mbstate_t __private_state;
150 mbstate_t* state = (ps == nullptr) ? &__private_state : ps;
151
152 if (!mbstate_is_initial(state)) {
153 return mbstate_reset_and_return_illegal(EILSEQ, state);
154 }
155
156 char buf[MB_LEN_MAX];
157 size_t i, o, r;
158 if (dst == nullptr) {
159 for (i = o = 0; i < nwc; i++, o += r) {
160 wchar_t wc = (*src)[i];
161 if (static_cast<uint32_t>(wc) < 0x80) {
162 // Fast path for plain ASCII characters.
163 if (wc == 0) {
164 return o;
165 }
166 r = 1;
167 } else {
168 r = wcrtomb(buf, wc, state);
169 if (r == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) {
170 return r;
171 }
172 }
173 }
174 return o;
175 }
176
177 for (i = o = 0; i < nwc && o < len; i++, o += r) {
178 wchar_t wc = (*src)[i];
179 if (static_cast<uint32_t>(wc) < 0x80) {
180 // Fast path for plain ASCII characters.
181 dst[o] = wc;
182 if (wc == 0) {
183 *src = nullptr;
184 return o;
185 }
186 r = 1;
187 } else if (len - o >= sizeof(buf)) {
188 // Enough space to translate in-place.
189 r = wcrtomb(dst + o, wc, state);
190 if (r == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) {
191 *src += i;
192 return r;
193 }
194 } else {
195 // May not be enough space; use temp buffer.
196 r = wcrtomb(buf, wc, state);
197 if (r == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) {
198 *src += i;
199 return r;
200 }
201 if (r > len - o) {
202 break;
203 }
204 memcpy(dst + o, buf, r);
205 }
206 }
207 *src += i;
208 return o;
209 }
210
wcsrtombs(char * dst,const wchar_t ** src,size_t len,mbstate_t * ps)211 size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) {
212 return wcsnrtombs(dst, src, SIZE_MAX, len, ps);
213 }
214 __strong_alias(wcsrtombs_l, wcsrtombs);
215