1 /* Copyright 2016 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6 #include <stdlib.h>
7 #include <stdint.h>
8 #include <sys/types.h>
9
10 #ifdef CRAS_DBUS
11 #include <dbus/dbus.h>
12 #endif
13
14 #include "cras_utf8.h"
15 #include "cras_util.h"
16
/* The three bytes that encode U+FEFF (the byte-order mark) in UTF-8. */
static const uint8_t kUTF8ByteOrderMask[] = {
	0xef,
	0xbb,
	0xbf,
};
18
/* An inclusive range of byte values accepted at one position of a
 * multi-byte sequence. An entry whose min is 0 terminates a table of
 * ranges. */
struct u8range {
	uint8_t min; /* Smallest byte value accepted at this position. */
	uint8_t max; /* Largest byte value accepted at this position. */
};

typedef struct u8range u8range_t;
23
24 static const u8range_t kUTF8TwoByteSeq[] = {
25 { 0xc2, 0xdf },
26 { 0x80, 0xbf },
27 { 0, 0 }
28 };
29
30 static const u8range_t kUTF8ByteSeqE0[] = {
31 { 0xe0, 0xe0 },
32 { 0xa0, 0xbf },
33 { 0x80, 0xbf },
34 { 0, 0 }
35 };
36
37 static const u8range_t kUTF8ByteSeqE1EC[] = {
38 { 0xe1, 0xec },
39 { 0x80, 0xbf },
40 { 0x80, 0xbf },
41 { 0, 0 }
42 };
43
44 static const u8range_t kUTF8ByteSeqED[] = {
45 { 0xed, 0xed },
46 { 0x80, 0x9f },
47 { 0x80, 0xbf },
48 { 0, 0 }
49 };
50
51 static const u8range_t kUTF8ByteSeqEEEF[] = {
52 { 0xee, 0xef },
53 { 0x80, 0xbf },
54 { 0x80, 0xbf },
55 { 0, 0 }
56 };
57
58 static const u8range_t kUTF8ByteSeqF0[] = {
59 { 0xf0, 0xf0 },
60 { 0x90, 0xbf },
61 { 0x80, 0xbf },
62 { 0x80, 0xbf },
63 { 0, 0 }
64 };
65
66 static const u8range_t kUTF8ByteSeqF1F3[] = {
67 { 0xf1, 0xf3 },
68 { 0x80, 0xbf },
69 { 0x80, 0xbf },
70 { 0x80, 0xbf },
71 { 0, 0 }
72 };
73
74 static const u8range_t kUTF8ByteSeqF4[] = {
75 { 0xf4, 0xf4 },
76 { 0x80, 0x8f },
77 { 0x80, 0xbf },
78 { 0x80, 0xbf },
79 { 0, 0 }
80 };
81
82 static const u8range_t kUTF8NullRange[] = {
83 { 0, 0 }
84 };
85
86 typedef struct utf8seq {
87 const u8range_t *ranges;
88 } utf8seq_t;
89
90 static const utf8seq_t kUTF8Sequences[] = {
91 { kUTF8TwoByteSeq },
92 { kUTF8ByteSeqE0 },
93 { kUTF8ByteSeqE1EC },
94 { kUTF8ByteSeqED },
95 { kUTF8ByteSeqEEEF },
96 { kUTF8ByteSeqF0 },
97 { kUTF8ByteSeqF1F3 },
98 { kUTF8ByteSeqF4 },
99 { kUTF8NullRange }
100 };
101
valid_utf8_string(const char * string,size_t * bad_pos)102 int valid_utf8_string(const char *string, size_t *bad_pos)
103 {
104 int bom_chars = 0;
105 uint8_t byte;
106 const char *pos = string;
107 int ret = 1;
108 const utf8seq_t *seq = NULL;
109 const u8range_t *range = NULL;
110
111 if (!pos) {
112 ret = 0;
113 goto error;
114 }
115
116 while ((byte = (uint8_t)*(pos++))) {
117 if (!range || range->min == 0) {
118 if (byte < 128) {
119 /* Ascii character. */
120 continue;
121 }
122
123 if (bom_chars < ARRAY_SIZE(kUTF8ByteOrderMask)) {
124 if (byte == kUTF8ByteOrderMask[bom_chars]) {
125 bom_chars++;
126 continue;
127 } else {
128 /* Characters not matching BOM.
129 * Rewind and assume that there is
130 * no BOM. */
131 bom_chars =
132 ARRAY_SIZE(kUTF8ByteOrderMask);
133 pos = string;
134 continue;
135 }
136 }
137
138 /* Find the matching sequence of characters by
139 * matching the first character in the sequence.
140 */
141 seq = kUTF8Sequences;
142 while (seq->ranges->min != 0) {
143 if (byte >= seq->ranges->min &&
144 byte <= seq->ranges->max) {
145 /* Matching sequence. */
146 break;
147 }
148 seq++;
149 }
150
151 if (seq->ranges->min == 0) {
152 /* Could not find a matching sequence. */
153 ret = 0;
154 goto error;
155 }
156
157 /* Found the appropriate sequence. */
158 range = seq->ranges + 1;
159 continue;
160 }
161
162 if (byte >= range->min && byte <= range->max) {
163 range++;
164 continue;
165 }
166
167 /* This character doesn't belong in UTF8. */
168 ret = 0;
169 goto error;
170 }
171
172 if (range && range->min != 0) {
173 /* Stopped in the middle of a sequence. */
174 ret = 0;
175 }
176
177 error:
178 if (bad_pos)
179 *bad_pos = pos - string - 1;
180 return ret;
181 }
182
183 #ifdef CRAS_DBUS
184 /* Use the DBus implementation if available to ensure that the UTF-8
185 * sequences match those expected by the DBus implementation. */
186
is_utf8_string(const char * string)187 int is_utf8_string(const char *string)
188 {
189 return !!dbus_validate_utf8(string, NULL);
190 }
191
192 #else
193
/* Returns the result of valid_utf8_string() for |string|, without asking
 * for the position of the first bad byte. */
int is_utf8_string(const char *string)
{
	const int valid = valid_utf8_string(string, NULL);

	return valid;
}
197
198 #endif
199