1 /*******************************************************************************
2 * Copyright (c) 2009, 2018 IBM Corp.
3 *
4 * All rights reserved. This program and the accompanying materials
5 * are made available under the terms of the Eclipse Public License v2.0
6 * and Eclipse Distribution License v1.0 which accompany this distribution.
7 *
8 * The Eclipse Public License is available at
9 * https://www.eclipse.org/legal/epl-2.0/
10 * and the Eclipse Distribution License is available at
11 * http://www.eclipse.org/org/documents/edl-v10.php.
12 *
13 * Contributors:
14 * Ian Craggs - initial API and implementation and/or initial documentation
15 *******************************************************************************/
16
17
18 /**
19 * @file
20 * \brief Functions for checking that strings contain UTF-8 characters only
21 *
22 * See page 104 of the Unicode Standard 5.0 for the list of well formed
23 * UTF-8 byte sequences.
24 *
25 */
26 #include "utf-8.h"
27
28 #include <stdlib.h>
29 #include <string.h>
30
31 #include "StackTrace.h"
32
/**
 * Macro to determine the number of elements in a single-dimension array.
 *
 * Only meaningful for a real array object: applied to a pointer (including an
 * array that has decayed to a pointer as a function parameter) the result is
 * not an element count. The argument is parenthesized in the expansion so
 * that any array-valued expression can be passed without precedence surprises.
 */
#if !defined(ARRAY_SIZE)
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#endif
39
40
/**
 * Table of the well-formed UTF-8 byte sequences.
 *
 * Each row describes one family of encodings: the number of bytes the
 * character occupies and, for each of those bytes in order, the inclusive
 * range of values that byte may take.
 */
struct
{
	int len;            /**< how many entries of bytes[] are in use (1 to 4) */
	struct
	{
		char lower; /**< smallest value allowed for this byte */
		char upper; /**< largest value allowed for this byte */
	} bytes[4];         /**< per-byte limits; a character is at most 4 bytes long */
}
valid_ranges[] =
{
	/* U+0000 .. U+007F */
	{1, { {0x00, 0x7F} } },
	/* U+0080 .. U+07FF */
	{2, { {0xC2, 0xDF}, {0x80, 0xBF} } },
	/* U+0800 .. U+0FFF */
	{3, { {0xE0, 0xE0}, {0xA0, 0xBF}, {0x80, 0xBF} } },
	/* U+1000 .. U+CFFF */
	{3, { {0xE1, 0xEC}, {0x80, 0xBF}, {0x80, 0xBF} } },
	/* U+D000 .. U+D7FF (stops short of the surrogate range) */
	{3, { {0xED, 0xED}, {0x80, 0x9F}, {0x80, 0xBF} } },
	/* U+E000 .. U+FFFF */
	{3, { {0xEE, 0xEF}, {0x80, 0xBF}, {0x80, 0xBF} } },
	/* U+10000 .. U+3FFFF */
	{4, { {0xF0, 0xF0}, {0x90, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF} } },
	/* U+40000 .. U+FFFFF */
	{4, { {0xF1, 0xF3}, {0x80, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF} } },
	/* U+100000 .. U+10FFFF */
	{4, { {0xF4, 0xF4}, {0x80, 0x8F}, {0x80, 0xBF}, {0x80, 0xBF} } },
};
65
66
67 static const char* UTF8_char_validate(int len, const char* data);
68
69
70 /**
71 * Validate a single UTF-8 character
72 * @param len the length of the string in "data"
73 * @param data the bytes to check for a valid UTF-8 char
74 * @return pointer to the start of the next UTF-8 character in "data"
75 */
UTF8_char_validate(int len,const char * data)76 static const char* UTF8_char_validate(int len, const char* data)
77 {
78 int good = 0;
79 int charlen = 2;
80 int i, j;
81 const char *rc = NULL;
82
83 if (data == NULL)
84 goto exit; /* don't have data, can't continue */
85
86 /* first work out how many bytes this char is encoded in */
87 if ((data[0] & 128) == 0)
88 charlen = 1;
89 else if ((data[0] & 0xF0) == 0xF0)
90 charlen = 4;
91 else if ((data[0] & 0xE0) == 0xE0)
92 charlen = 3;
93
94 if (charlen > len)
95 goto exit; /* not enough characters in the string we were given */
96
97 for (i = 0; i < ARRAY_SIZE(valid_ranges); ++i)
98 { /* just has to match one of these rows */
99 if (valid_ranges[i].len == charlen)
100 {
101 good = 1;
102 for (j = 0; j < charlen; ++j)
103 {
104 if (data[j] < valid_ranges[i].bytes[j].lower ||
105 data[j] > valid_ranges[i].bytes[j].upper)
106 {
107 good = 0; /* failed the check */
108 break;
109 }
110 }
111 if (good)
112 break;
113 }
114 }
115
116 if (good)
117 rc = data + charlen;
118 exit:
119 return rc;
120 }
121
122
123 /**
124 * Validate a length-delimited string has only UTF-8 characters
125 * @param len the length of the string in "data"
126 * @param data the bytes to check for valid UTF-8 characters
127 * @return 1 (true) if the string has only UTF-8 characters, 0 (false) otherwise
128 */
UTF8_validate(int len,const char * data)129 int UTF8_validate(int len, const char* data)
130 {
131 const char* curdata = NULL;
132 int rc = 0;
133
134 FUNC_ENTRY;
135 if (len == 0 || data == NULL)
136 {
137 rc = 1;
138 goto exit;
139 }
140 curdata = UTF8_char_validate(len, data);
141 while (curdata && (curdata < data + len))
142 curdata = UTF8_char_validate((int)(data + len - curdata), curdata);
143
144 rc = curdata != NULL;
145 exit:
146 FUNC_EXIT_RC(rc);
147 return rc;
148 }
149
150
151 /**
152 * Validate a null-terminated string has only UTF-8 characters
153 * @param string the string to check for valid UTF-8 characters
154 * @return 1 (true) if the string has only UTF-8 characters, 0 (false) otherwise
155 */
UTF8_validateString(const char * string)156 int UTF8_validateString(const char* string)
157 {
158 int rc = 0;
159
160 FUNC_ENTRY;
161 if (string != NULL)
162 {
163 rc = UTF8_validate((int)strlen(string), string);
164 }
165 FUNC_EXIT_RC(rc);
166 return rc;
167 }
168
169
170
171 #if defined(UNIT_TESTS)
172 #include <stdio.h>
173
/**
 * One unit-test vector: a length-delimited byte string.
 */
typedef struct
{
	int len;       /**< number of leading bytes of data[] that form the string */
	char data[20]; /**< the bytes themselves; room for up to 20 */
} tests;
179
180 tests valid_strings[] =
181 {
182 {3, "hjk" },
183 {7, {0x41, 0xE2, 0x89, 0xA2, 0xCE, 0x91, 0x2E} },
184 {3, {'f', 0xC9, 0xB1 } },
185 {9, {0xED, 0x95, 0x9C, 0xEA, 0xB5, 0xAD, 0xEC, 0x96, 0xB4} },
186 {9, {0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E} },
187 {4, {0x2F, 0x2E, 0x2E, 0x2F} },
188 {7, {0xEF, 0xBB, 0xBF, 0xF0, 0xA3, 0x8E, 0xB4} },
189 };
190
191 tests invalid_strings[] =
192 {
193 {2, {0xC0, 0x80} },
194 {5, {0x2F, 0xC0, 0xAE, 0x2E, 0x2F} },
195 {6, {0xED, 0xA1, 0x8C, 0xED, 0xBE, 0xB4} },
196 {1, {0xF4} },
197 };
198
main(int argc,char * argv[])199 int main (int argc, char *argv[])
200 {
201 int i, failed = 0;
202
203 for (i = 0; i < ARRAY_SIZE(valid_strings); ++i)
204 {
205 if (!UTF8_validate(valid_strings[i].len, valid_strings[i].data))
206 {
207 printf("valid test %d failed\n", i);
208 failed = 1;
209 }
210 else
211 printf("valid test %d passed\n", i);
212 }
213
214 for (i = 0; i < ARRAY_SIZE(invalid_strings); ++i)
215 {
216 if (UTF8_validate(invalid_strings[i].len, invalid_strings[i].data))
217 {
218 printf("invalid test %d failed\n", i);
219 failed = 1;
220 }
221 else
222 printf("invalid test %d passed\n", i);
223 }
224
225 if (failed)
226 printf("Failed\n");
227 else
228 printf("Passed\n");
229
230 //Don't crash on null data
231 UTF8_validateString(NULL);
232 UTF8_validate(1, NULL);
233 UTF8_char_validate(1, NULL);
234
235 return 0;
236 } /* End of main function*/
237
238 #endif
239
240