1 /* Copyright 2016 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6 #include <stdlib.h>
7 #include <stdint.h>
8 #include <sys/types.h>
9
10 #ifdef CRAS_DBUS
11 #include <dbus/dbus.h>
12 #endif
13
14 #include "cras_utf8.h"
15 #include "cras_util.h"
16
/* The three bytes that encode the byte order mark (U+FEFF) in UTF-8. */
static const uint8_t kUTF8ByteOrderMask[] = {
	0xEF,
	0xBB,
	0xBF,
};
18
/* Inclusive range of values accepted for one byte of a UTF-8 sequence. */
typedef struct u8range {
	uint8_t min; /* Lowest accepted value. */
	uint8_t max; /* Highest accepted value. */
} u8range_t;

/*
 * Each table below describes one class of multi-byte sequence, one entry per
 * byte. The first entry is the range of the lead byte, the following entries
 * are the ranges of the trailing bytes, and an entry with min == 0 ends the
 * table.
 */

/* Two-byte sequences. */
static const u8range_t kUTF8TwoByteSeq[] = {
	{ .min = 0xc2, .max = 0xdf },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0, .max = 0 },
};

/* Three-byte sequences led by E0; the second byte starts at A0, which
 * excludes overlong encodings. */
static const u8range_t kUTF8ByteSeqE0[] = {
	{ .min = 0xe0, .max = 0xe0 },
	{ .min = 0xa0, .max = 0xbf },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0, .max = 0 },
};

/* Three-byte sequences led by E1..EC. */
static const u8range_t kUTF8ByteSeqE1EC[] = {
	{ .min = 0xe1, .max = 0xec },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0, .max = 0 },
};

/* Three-byte sequences led by ED; the second byte stops at 9F, which
 * excludes the UTF-16 surrogate code points. */
static const u8range_t kUTF8ByteSeqED[] = {
	{ .min = 0xed, .max = 0xed },
	{ .min = 0x80, .max = 0x9f },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0, .max = 0 },
};

/* Three-byte sequences led by EE..EF. */
static const u8range_t kUTF8ByteSeqEEEF[] = {
	{ .min = 0xee, .max = 0xef },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0, .max = 0 },
};

/* Four-byte sequences led by F0; the second byte starts at 90, which
 * excludes overlong encodings. */
static const u8range_t kUTF8ByteSeqF0[] = {
	{ .min = 0xf0, .max = 0xf0 },
	{ .min = 0x90, .max = 0xbf },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0, .max = 0 },
};

/* Four-byte sequences led by F1..F3. */
static const u8range_t kUTF8ByteSeqF1F3[] = {
	{ .min = 0xf1, .max = 0xf3 },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0, .max = 0 },
};

/* Four-byte sequences led by F4; the second byte stops at 8F, which keeps
 * the code point at or below U+10FFFF. */
static const u8range_t kUTF8ByteSeqF4[] = {
	{ .min = 0xf4, .max = 0xf4 },
	{ .min = 0x80, .max = 0x8f },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0x80, .max = 0xbf },
	{ .min = 0, .max = 0 },
};

/* Table holding only the end marker. */
static const u8range_t kUTF8NullRange[] = {
	{ .min = 0, .max = 0 },
};
74
75 typedef struct utf8seq {
76 const u8range_t *ranges;
77 } utf8seq_t;
78
79 static const utf8seq_t kUTF8Sequences[] = {
80 { kUTF8TwoByteSeq }, { kUTF8ByteSeqE0 }, { kUTF8ByteSeqE1EC },
81 { kUTF8ByteSeqED }, { kUTF8ByteSeqEEEF }, { kUTF8ByteSeqF0 },
82 { kUTF8ByteSeqF1F3 }, { kUTF8ByteSeqF4 }, { kUTF8NullRange }
83 };
84
valid_utf8_string(const char * string,size_t * bad_pos)85 int valid_utf8_string(const char *string, size_t *bad_pos)
86 {
87 int bom_chars = 0;
88 uint8_t byte;
89 const char *pos = string;
90 int ret = 1;
91 const utf8seq_t *seq = NULL;
92 const u8range_t *range = NULL;
93
94 if (!pos) {
95 ret = 0;
96 goto error;
97 }
98
99 while ((byte = (uint8_t) * (pos++))) {
100 if (!range || range->min == 0) {
101 if (byte < 128) {
102 /* Ascii character. */
103 continue;
104 }
105
106 if (bom_chars < ARRAY_SIZE(kUTF8ByteOrderMask)) {
107 if (byte == kUTF8ByteOrderMask[bom_chars]) {
108 bom_chars++;
109 continue;
110 } else {
111 /* Characters not matching BOM.
112 * Rewind and assume that there is
113 * no BOM. */
114 bom_chars =
115 ARRAY_SIZE(kUTF8ByteOrderMask);
116 pos = string;
117 continue;
118 }
119 }
120
121 /* Find the matching sequence of characters by
122 * matching the first character in the sequence.
123 */
124 seq = kUTF8Sequences;
125 while (seq->ranges->min != 0) {
126 if (byte >= seq->ranges->min &&
127 byte <= seq->ranges->max) {
128 /* Matching sequence. */
129 break;
130 }
131 seq++;
132 }
133
134 if (seq->ranges->min == 0) {
135 /* Could not find a matching sequence. */
136 ret = 0;
137 goto error;
138 }
139
140 /* Found the appropriate sequence. */
141 range = seq->ranges + 1;
142 continue;
143 }
144
145 if (byte >= range->min && byte <= range->max) {
146 range++;
147 continue;
148 }
149
150 /* This character doesn't belong in UTF8. */
151 ret = 0;
152 goto error;
153 }
154
155 if (range && range->min != 0) {
156 /* Stopped in the middle of a sequence. */
157 ret = 0;
158 }
159
160 error:
161 if (bad_pos)
162 *bad_pos = pos - string - 1;
163 return ret;
164 }
165
/* Reports whether a string is valid UTF-8.
 *
 * When CRAS_DBUS is defined the check is delegated to dbus_validate_utf8()
 * so that the accepted sequences match those expected by the DBus
 * implementation; otherwise valid_utf8_string() is used.
 *
 * Args:
 *    string - The string to check.
 * Returns:
 *    1 if the string is valid UTF-8, 0 otherwise.
 */
int is_utf8_string(const char *string)
{
#ifdef CRAS_DBUS
	return dbus_validate_utf8(string, NULL) ? 1 : 0;
#else
	return valid_utf8_string(string, NULL);
#endif
}
183