• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*******************************************************************************
2  * Copyright (c) 2009, 2018 IBM Corp.
3  *
4  * All rights reserved. This program and the accompanying materials
5  * are made available under the terms of the Eclipse Public License v2.0
6  * and Eclipse Distribution License v1.0 which accompany this distribution.
7  *
8  * The Eclipse Public License is available at
9  *    https://www.eclipse.org/legal/epl-2.0/
10  * and the Eclipse Distribution License is available at
11  *   http://www.eclipse.org/org/documents/edl-v10.php.
12  *
13  * Contributors:
14  *    Ian Craggs - initial API and implementation and/or initial documentation
15  *******************************************************************************/
16 
17 
18 /**
19  * @file
20  * \brief Functions for checking that strings contain UTF-8 characters only
21  *
22  * See page 104 of the Unicode Standard 5.0 for the list of well formed
23  * UTF-8 byte sequences.
24  *
25  */
26 #include "utf-8.h"
27 
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #include "StackTrace.h"
32 
/**
 * Macro to determine the number of elements in a single-dimension array.
 * Only meaningful for true arrays: applied to a pointer it yields the pointer
 * size divided by the element size, not an element count.
 * The argument is parenthesized so that expression arguments index correctly.
 */
#if !defined(ARRAY_SIZE)
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#endif
39 
40 
/**
 * Table of the well-formed UTF-8 byte sequences. Each row describes one
 * family of encodings: its length in bytes and, for every byte position,
 * the inclusive range of values allowed at that position.
 */
struct
{
	int len; /**< number of bytes in the encoded character (1 to 4) */
	struct
	{
		char lower; /**< smallest value accepted at this byte position */
		char upper; /**< largest value accepted at this byte position */
	} bytes[4];   /**< one range per byte position; only the first len are used */
}
valid_ranges[] =
{
	/* U+0000 .. U+007F */
	{1, { {0x00, 0x7F} } },

	/* U+0080 .. U+07FF */
	{2, { {0xC2, 0xDF}, {0x80, 0xBF} } },

	/* U+0800 .. U+0FFF (second byte starts at 0xA0 to exclude overlong forms) */
	{3, { {0xE0, 0xE0}, {0xA0, 0xBF}, {0x80, 0xBF} } },
	/* U+1000 .. U+CFFF */
	{3, { {0xE1, 0xEC}, {0x80, 0xBF}, {0x80, 0xBF} } },
	/* U+D000 .. U+D7FF (second byte stops at 0x9F to exclude surrogates) */
	{3, { {0xED, 0xED}, {0x80, 0x9F}, {0x80, 0xBF} } },
	/* U+E000 .. U+FFFF */
	{3, { {0xEE, 0xEF}, {0x80, 0xBF}, {0x80, 0xBF} } },

	/* U+10000 .. U+3FFFF (second byte starts at 0x90 to exclude overlong forms) */
	{4, { {0xF0, 0xF0}, {0x90, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF} } },
	/* U+40000 .. U+FFFFF */
	{4, { {0xF1, 0xF3}, {0x80, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF} } },
	/* U+100000 .. U+10FFFF (second byte stops at 0x8F, the last code point) */
	{4, { {0xF4, 0xF4}, {0x80, 0x8F}, {0x80, 0xBF}, {0x80, 0xBF} } },
};
65 
66 
67 static const char* UTF8_char_validate(int len, const char* data);
68 
69 
70 /**
71  * Validate a single UTF-8 character
72  * @param len the length of the string in "data"
73  * @param data the bytes to check for a valid UTF-8 char
74  * @return pointer to the start of the next UTF-8 character in "data"
75  */
UTF8_char_validate(int len,const char * data)76 static const char* UTF8_char_validate(int len, const char* data)
77 {
78 	int good = 0;
79 	int charlen = 2;
80 	int i, j;
81 	const char *rc = NULL;
82 
83 	if (data == NULL)
84 		goto exit;	/* don't have data, can't continue */
85 
86 	/* first work out how many bytes this char is encoded in */
87 	if ((data[0] & 128) == 0)
88 		charlen = 1;
89 	else if ((data[0] & 0xF0) == 0xF0)
90 		charlen = 4;
91 	else if ((data[0] & 0xE0) == 0xE0)
92 		charlen = 3;
93 
94 	if (charlen > len)
95 		goto exit;	/* not enough characters in the string we were given */
96 
97 	for (i = 0; i < ARRAY_SIZE(valid_ranges); ++i)
98 	{ /* just has to match one of these rows */
99 		if (valid_ranges[i].len == charlen)
100 		{
101 			good = 1;
102 			for (j = 0; j < charlen; ++j)
103 			{
104 				if (data[j] < valid_ranges[i].bytes[j].lower ||
105 						data[j] > valid_ranges[i].bytes[j].upper)
106 				{
107 					good = 0;  /* failed the check */
108 					break;
109 				}
110 			}
111 			if (good)
112 				break;
113 		}
114 	}
115 
116 	if (good)
117 		rc = data + charlen;
118 	exit:
119 	return rc;
120 }
121 
122 
123 /**
124  * Validate a length-delimited string has only UTF-8 characters
125  * @param len the length of the string in "data"
126  * @param data the bytes to check for valid UTF-8 characters
127  * @return 1 (true) if the string has only UTF-8 characters, 0 (false) otherwise
128  */
UTF8_validate(int len,const char * data)129 int UTF8_validate(int len, const char* data)
130 {
131 	const char* curdata = NULL;
132 	int rc = 0;
133 
134 	FUNC_ENTRY;
135 	if (len == 0 || data == NULL)
136 	{
137 		rc = 1;
138 		goto exit;
139 	}
140 	curdata = UTF8_char_validate(len, data);
141 	while (curdata && (curdata < data + len))
142 		curdata = UTF8_char_validate((int)(data + len - curdata), curdata);
143 
144 	rc = curdata != NULL;
145 exit:
146 	FUNC_EXIT_RC(rc);
147 	return rc;
148 }
149 
150 
151 /**
152  * Validate a null-terminated string has only UTF-8 characters
153  * @param string the string to check for valid UTF-8 characters
154  * @return 1 (true) if the string has only UTF-8 characters, 0 (false) otherwise
155  */
UTF8_validateString(const char * string)156 int UTF8_validateString(const char* string)
157 {
158 	int rc = 0;
159 
160 	FUNC_ENTRY;
161 	if (string != NULL)
162 	{
163 		rc = UTF8_validate((int)strlen(string), string);
164 	}
165 	FUNC_EXIT_RC(rc);
166 	return rc;
167 }
168 
169 
170 
171 #if defined(UNIT_TESTS)
172 #include <stdio.h>
173 
/**
 * One self-test case: a byte string and the number of bytes in it to validate
 */
typedef struct
{
	int len;       /**< number of bytes of data to pass to UTF8_validate */
	char data[20]; /**< the bytes themselves; unused trailing bytes are zero */
} tests;
179 
180 tests valid_strings[] =
181 {
182 		{3, "hjk" },
183 		{7, {0x41, 0xE2, 0x89, 0xA2, 0xCE, 0x91, 0x2E} },
184 		{3, {'f', 0xC9, 0xB1 } },
185 		{9, {0xED, 0x95, 0x9C, 0xEA, 0xB5, 0xAD, 0xEC, 0x96, 0xB4} },
186 		{9, {0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E} },
187 		{4, {0x2F, 0x2E, 0x2E, 0x2F} },
188 		{7, {0xEF, 0xBB, 0xBF, 0xF0, 0xA3, 0x8E, 0xB4} },
189 };
190 
191 tests invalid_strings[] =
192 {
193 		{2, {0xC0, 0x80} },
194 		{5, {0x2F, 0xC0, 0xAE, 0x2E, 0x2F} },
195 		{6, {0xED, 0xA1, 0x8C, 0xED, 0xBE, 0xB4} },
196 		{1, {0xF4} },
197 };
198 
main(int argc,char * argv[])199 int main (int argc, char *argv[])
200 {
201 	int i, failed = 0;
202 
203 	for (i = 0; i < ARRAY_SIZE(valid_strings); ++i)
204 	{
205 		if (!UTF8_validate(valid_strings[i].len, valid_strings[i].data))
206 		{
207 			printf("valid test %d failed\n", i);
208 			failed = 1;
209 		}
210 		else
211 			printf("valid test %d passed\n", i);
212 	}
213 
214 	for (i = 0; i < ARRAY_SIZE(invalid_strings); ++i)
215 	{
216 		if (UTF8_validate(invalid_strings[i].len, invalid_strings[i].data))
217 		{
218 			printf("invalid test %d failed\n", i);
219 			failed = 1;
220 		}
221 		else
222 			printf("invalid test %d passed\n", i);
223 	}
224 
225 	if (failed)
226 		printf("Failed\n");
227 	else
228 		printf("Passed\n");
229 
230     //Don't crash on null data
231 	UTF8_validateString(NULL);
232 	UTF8_validate(1, NULL);
233 	UTF8_char_validate(1, NULL);
234 
235 	return 0;
236 } /* End of main function*/
237 
238 #endif
239 
240