1 /*
2 * libjingle
3 * Copyright 2011, Google Inc.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright notice,
11 * this list of conditions and the following disclaimer in the documentation
12 * and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
17 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
18 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
19 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
22 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
24 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
25 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "talk/base/stringencode.h"
29
30 #include <cstdio>
31 #include <cstdlib>
32
33 #include "talk/base/basictypes.h"
34 #include "talk/base/common.h"
35 #include "talk/base/stringutils.h"
36
37 namespace talk_base {
38
39 /////////////////////////////////////////////////////////////////////////////
40 // String Encoding Utilities
41 /////////////////////////////////////////////////////////////////////////////
42
43 static const char HEX[] = "0123456789abcdef";
44
hex_encode(unsigned char val)45 char hex_encode(unsigned char val) {
46 ASSERT(val < 16);
47 return (val < 16) ? HEX[val] : '!';
48 }
49
// Converts a single hexadecimal digit to its numeric value.
//
// ch:  the character to interpret; '0'-'9', 'a'-'f' and 'A'-'F' are accepted.
// val: receives the digit's value (0-15) on success; left untouched on
//      failure.
//
// Returns true if |ch| is a hexadecimal digit, false otherwise.  Letters
// beyond 'f'/'F' are rejected: accepting them would produce values above 15
// and corrupt the "(h1 << 4) | h2" byte that callers assemble from two digits.
bool hex_decode(char ch, unsigned char* val) {
  if ((ch >= '0') && (ch <= '9')) {
    *val = ch - '0';
  } else if ((ch >= 'A') && (ch <= 'F')) {
    *val = (ch - 'A') + 10;
  } else if ((ch >= 'a') && (ch <= 'f')) {
    *val = (ch - 'a') + 10;
  } else {
    return false;
  }
  return true;
}
62
escape(char * buffer,size_t buflen,const char * source,size_t srclen,const char * illegal,char escape)63 size_t escape(char * buffer, size_t buflen,
64 const char * source, size_t srclen,
65 const char * illegal, char escape) {
66 ASSERT(NULL != buffer); // TODO: estimate output size
67 if (buflen <= 0)
68 return 0;
69
70 size_t srcpos = 0, bufpos = 0;
71 while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
72 char ch = source[srcpos++];
73 if ((ch == escape) || ::strchr(illegal, ch)) {
74 if (bufpos + 2 >= buflen)
75 break;
76 buffer[bufpos++] = escape;
77 }
78 buffer[bufpos++] = ch;
79 }
80
81 buffer[bufpos] = '\0';
82 return bufpos;
83 }
84
unescape(char * buffer,size_t buflen,const char * source,size_t srclen,char escape)85 size_t unescape(char * buffer, size_t buflen,
86 const char * source, size_t srclen,
87 char escape) {
88 ASSERT(NULL != buffer); // TODO: estimate output size
89 if (buflen <= 0)
90 return 0;
91
92 size_t srcpos = 0, bufpos = 0;
93 while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
94 char ch = source[srcpos++];
95 if ((ch == escape) && (srcpos < srclen)) {
96 ch = source[srcpos++];
97 }
98 buffer[bufpos++] = ch;
99 }
100 buffer[bufpos] = '\0';
101 return bufpos;
102 }
103
encode(char * buffer,size_t buflen,const char * source,size_t srclen,const char * illegal,char escape)104 size_t encode(char * buffer, size_t buflen,
105 const char * source, size_t srclen,
106 const char * illegal, char escape) {
107 ASSERT(NULL != buffer); // TODO: estimate output size
108 if (buflen <= 0)
109 return 0;
110
111 size_t srcpos = 0, bufpos = 0;
112 while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
113 char ch = source[srcpos++];
114 if ((ch != escape) && !::strchr(illegal, ch)) {
115 buffer[bufpos++] = ch;
116 } else if (bufpos + 3 >= buflen) {
117 break;
118 } else {
119 buffer[bufpos+0] = escape;
120 buffer[bufpos+1] = hex_encode((static_cast<unsigned char>(ch) >> 4) & 0xF);
121 buffer[bufpos+2] = hex_encode((static_cast<unsigned char>(ch) ) & 0xF);
122 bufpos += 3;
123 }
124 }
125 buffer[bufpos] = '\0';
126 return bufpos;
127 }
128
decode(char * buffer,size_t buflen,const char * source,size_t srclen,char escape)129 size_t decode(char * buffer, size_t buflen,
130 const char * source, size_t srclen,
131 char escape) {
132 if (buflen <= 0)
133 return 0;
134
135 unsigned char h1, h2;
136 size_t srcpos = 0, bufpos = 0;
137 while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
138 char ch = source[srcpos++];
139 if ((ch == escape)
140 && (srcpos + 1 < srclen)
141 && hex_decode(source[srcpos], &h1)
142 && hex_decode(source[srcpos+1], &h2)) {
143 buffer[bufpos++] = (h1 << 4) | h2;
144 srcpos += 2;
145 } else {
146 buffer[bufpos++] = ch;
147 }
148 }
149 buffer[bufpos] = '\0';
150 return bufpos;
151 }
152
unsafe_filename_characters()153 const char* unsafe_filename_characters() {
154 // It might be better to have a single specification which is the union of
155 // all operating systems, unless one system is overly restrictive.
156 #ifdef WIN32
157 return "\\/:*?\"<>|";
158 #else // !WIN32
159 // TODO
160 ASSERT(false);
161 return "";
162 #endif // !WIN23
163 }
164
// Bit flags stored in ASCII_CLASS.  XML and HTML share one bit because they
// flag the same five characters.
const unsigned char URL_UNSAFE = 0x1;   // 0-32 "#$%&+,/:;<=>?@[\]^`{|} 127
const unsigned char XML_UNSAFE = 0x2;   // "&'<>
const unsigned char HTML_UNSAFE = 0x2;  // "&'<>

// Per-character classification of the 7-bit ASCII range, indexed by character
// code.  Each entry is a bitwise OR of the *_UNSAFE flags above, so 1 means
// URL-unsafe only, 2 means XML/HTML-unsafe only and 3 means both.
const unsigned char ASCII_CLASS[128] = {
  // 0x00-0x0F: control characters
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  // 0x10-0x1F: control characters
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  // 0x20-0x2F: space ! " # $ % & ' ( ) * + , - . /
  1,0,3,1,1,1,3,2,0,0,0,1,1,0,0,1,
  // 0x30-0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
  0,0,0,0,0,0,0,0,0,0,1,1,3,1,3,1,
  // 0x40-0x4F: @ A B C D E F G H I J K L M N O
  1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  // 0x50-0x5F: P Q R S T U V W X Y Z [ \ ] ^ _
  0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,
  // 0x60-0x6F: ` a b c d e f g h i j k l m n o
  1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  // 0x70-0x7F: p q r s t u v w x y z { | } ~ DEL
  0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,
};
179
url_encode(char * buffer,size_t buflen,const char * source,size_t srclen)180 size_t url_encode(char * buffer, size_t buflen,
181 const char * source, size_t srclen) {
182 if (NULL == buffer)
183 return srclen * 3 + 1;
184 if (buflen <= 0)
185 return 0;
186
187 size_t srcpos = 0, bufpos = 0;
188 while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
189 unsigned char ch = source[srcpos++];
190 if ((ch < 128) && (ASCII_CLASS[ch] & URL_UNSAFE)) {
191 if (bufpos + 3 >= buflen) {
192 break;
193 }
194 buffer[bufpos+0] = '%';
195 buffer[bufpos+1] = hex_encode((ch >> 4) & 0xF);
196 buffer[bufpos+2] = hex_encode((ch ) & 0xF);
197 bufpos += 3;
198 } else {
199 buffer[bufpos++] = ch;
200 }
201 }
202 buffer[bufpos] = '\0';
203 return bufpos;
204 }
205
url_decode(char * buffer,size_t buflen,const char * source,size_t srclen)206 size_t url_decode(char * buffer, size_t buflen,
207 const char * source, size_t srclen) {
208 if (NULL == buffer)
209 return srclen + 1;
210 if (buflen <= 0)
211 return 0;
212
213 unsigned char h1, h2;
214 size_t srcpos = 0, bufpos = 0;
215 while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
216 unsigned char ch = source[srcpos++];
217 if (ch == '+') {
218 buffer[bufpos++] = ' ';
219 } else if ((ch == '%')
220 && (srcpos + 1 < srclen)
221 && hex_decode(source[srcpos], &h1)
222 && hex_decode(source[srcpos+1], &h2))
223 {
224 buffer[bufpos++] = (h1 << 4) | h2;
225 srcpos += 2;
226 } else {
227 buffer[bufpos++] = ch;
228 }
229 }
230 buffer[bufpos] = '\0';
231 return bufpos;
232 }
233
// Decodes the UTF-8 sequence at the start of |source|.
//
// source: bytes to decode; at most |srclen| of them are read.
// srclen: number of bytes available in |source|.
// value:  receives the decoded value on success; left untouched on failure.
//
// Returns the number of bytes consumed (1-4), or 0 if |source| is empty, the
// sequence is truncated, a trailer byte is not of the form 10xxxxxx, or the
// lead byte is not a recognised 1-4 byte lead.  Overlong encodings and
// surrogate values are not rejected.
size_t utf8_decode(const char* source, size_t srclen, unsigned long* value) {
  if (srclen == 0) {
    // Nothing to decode; reading s[0] would run past the end of the input.
    return 0;
  }
  const unsigned char* s = reinterpret_cast<const unsigned char*>(source);
  if ((s[0] & 0x80) == 0x00) {                    // Check s[0] == 0xxxxxxx
    *value = s[0];
    return 1;
  }
  if ((srclen < 2) || ((s[1] & 0xC0) != 0x80)) {  // Check s[1] != 10xxxxxx
    return 0;
  }
  // Accumulate the trailer byte payloads in |trailers|, and combine them with
  // the relevant bits from s[0] once the sequence length has been determined.
  unsigned long trailers = (s[1] & 0x3F);
  if ((s[0] & 0xE0) == 0xC0) {                    // Check s[0] == 110xxxxx
    *value = ((s[0] & 0x1F) << 6) | trailers;
    return 2;
  }
  if ((srclen < 3) || ((s[2] & 0xC0) != 0x80)) {  // Check s[2] != 10xxxxxx
    return 0;
  }
  trailers = (trailers << 6) | (s[2] & 0x3F);
  if ((s[0] & 0xF0) == 0xE0) {                    // Check s[0] == 1110xxxx
    *value = ((s[0] & 0x0F) << 12) | trailers;
    return 3;
  }
  if ((srclen < 4) || ((s[3] & 0xC0) != 0x80)) {  // Check s[3] != 10xxxxxx
    return 0;
  }
  trailers = (trailers << 6) | (s[3] & 0x3F);
  if ((s[0] & 0xF8) == 0xF0) {                    // Check s[0] == 11110xxx
    *value = ((s[0] & 0x07) << 18) | trailers;
    return 4;
  }
  return 0;
}
268
// Writes |value| to |buffer| as a UTF-8 sequence of 1 to 4 bytes.  No
// terminator is appended.
//
// Returns the number of bytes written, or 0 (writing nothing) when |value|
// exceeds 0x1FFFFF or |buflen| is smaller than the sequence requires.
size_t utf8_encode(char* buffer, size_t buflen, unsigned long value) {
  // Lead-byte marker bits, indexed by total sequence length.
  static const unsigned char kLeadMarker[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };

  size_t needed;
  if (value <= 0x7F) {
    needed = 1;
  } else if (value <= 0x7FF) {
    needed = 2;
  } else if (value <= 0xFFFF) {
    needed = 3;
  } else if (value <= 0x1FFFFF) {
    needed = 4;
  } else {
    return 0;
  }
  if (buflen < needed) {
    return 0;
  }

  // Fill the trailer bytes from the last one backwards, peeling six payload
  // bits off |value| each time; what remains belongs in the lead byte.
  for (size_t i = needed - 1; i > 0; --i) {
    buffer[i] = 0x80 | static_cast<unsigned char>(value & 0x3F);
    value >>= 6;
  }
  buffer[0] = kLeadMarker[needed] | static_cast<unsigned char>(value);
  return needed;
}
294
html_encode(char * buffer,size_t buflen,const char * source,size_t srclen)295 size_t html_encode(char * buffer, size_t buflen,
296 const char * source, size_t srclen) {
297 ASSERT(NULL != buffer); // TODO: estimate output size
298 if (buflen <= 0)
299 return 0;
300
301 size_t srcpos = 0, bufpos = 0;
302 while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
303 unsigned char ch = source[srcpos];
304 if (ch < 128) {
305 srcpos += 1;
306 if (ASCII_CLASS[ch] & HTML_UNSAFE) {
307 const char * escseq = 0;
308 size_t esclen = 0;
309 switch (ch) {
310 case '<': escseq = "<"; esclen = 4; break;
311 case '>': escseq = ">"; esclen = 4; break;
312 case '\'': escseq = "'"; esclen = 5; break;
313 case '\"': escseq = """; esclen = 6; break;
314 case '&': escseq = "&"; esclen = 5; break;
315 default: ASSERT(false);
316 }
317 if (bufpos + esclen >= buflen) {
318 break;
319 }
320 memcpy(buffer + bufpos, escseq, esclen);
321 bufpos += esclen;
322 } else {
323 buffer[bufpos++] = ch;
324 }
325 } else {
326 // Largest value is 0x1FFFFF => � (10 characters)
327 char escseq[11];
328 unsigned long val;
329 if (size_t vallen = utf8_decode(&source[srcpos], srclen - srcpos, &val)) {
330 srcpos += vallen;
331 } else {
332 // Not a valid utf8 sequence, just use the raw character.
333 val = static_cast<unsigned char>(source[srcpos++]);
334 }
335 size_t esclen = sprintfn(escseq, ARRAY_SIZE(escseq), "&#%lu;", val);
336 if (bufpos + esclen >= buflen) {
337 break;
338 }
339 memcpy(buffer + bufpos, escseq, esclen);
340 bufpos += esclen;
341 }
342 }
343 buffer[bufpos] = '\0';
344 return bufpos;
345 }
346
html_decode(char * buffer,size_t buflen,const char * source,size_t srclen)347 size_t html_decode(char * buffer, size_t buflen,
348 const char * source, size_t srclen) {
349 ASSERT(NULL != buffer); // TODO: estimate output size
350 return xml_decode(buffer, buflen, source, srclen);
351 }
352
xml_encode(char * buffer,size_t buflen,const char * source,size_t srclen)353 size_t xml_encode(char * buffer, size_t buflen,
354 const char * source, size_t srclen) {
355 ASSERT(NULL != buffer); // TODO: estimate output size
356 if (buflen <= 0)
357 return 0;
358
359 size_t srcpos = 0, bufpos = 0;
360 while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
361 unsigned char ch = source[srcpos++];
362 if ((ch < 128) && (ASCII_CLASS[ch] & XML_UNSAFE)) {
363 const char * escseq = 0;
364 size_t esclen = 0;
365 switch (ch) {
366 case '<': escseq = "<"; esclen = 4; break;
367 case '>': escseq = ">"; esclen = 4; break;
368 case '\'': escseq = "'"; esclen = 6; break;
369 case '\"': escseq = """; esclen = 6; break;
370 case '&': escseq = "&"; esclen = 5; break;
371 default: ASSERT(false);
372 }
373 if (bufpos + esclen >= buflen) {
374 break;
375 }
376 memcpy(buffer + bufpos, escseq, esclen);
377 bufpos += esclen;
378 } else {
379 buffer[bufpos++] = ch;
380 }
381 }
382 buffer[bufpos] = '\0';
383 return bufpos;
384 }
385
xml_decode(char * buffer,size_t buflen,const char * source,size_t srclen)386 size_t xml_decode(char * buffer, size_t buflen,
387 const char * source, size_t srclen) {
388 ASSERT(NULL != buffer); // TODO: estimate output size
389 if (buflen <= 0)
390 return 0;
391
392 size_t srcpos = 0, bufpos = 0;
393 while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
394 unsigned char ch = source[srcpos++];
395 if (ch != '&') {
396 buffer[bufpos++] = ch;
397 } else if ((srcpos + 2 < srclen)
398 && (memcmp(source + srcpos, "lt;", 3) == 0)) {
399 buffer[bufpos++] = '<';
400 srcpos += 3;
401 } else if ((srcpos + 2 < srclen)
402 && (memcmp(source + srcpos, "gt;", 3) == 0)) {
403 buffer[bufpos++] = '>';
404 srcpos += 3;
405 } else if ((srcpos + 4 < srclen)
406 && (memcmp(source + srcpos, "apos;", 5) == 0)) {
407 buffer[bufpos++] = '\'';
408 srcpos += 5;
409 } else if ((srcpos + 4 < srclen)
410 && (memcmp(source + srcpos, "quot;", 5) == 0)) {
411 buffer[bufpos++] = '\"';
412 srcpos += 5;
413 } else if ((srcpos + 3 < srclen)
414 && (memcmp(source + srcpos, "amp;", 4) == 0)) {
415 buffer[bufpos++] = '&';
416 srcpos += 4;
417 } else if ((srcpos < srclen) && (source[srcpos] == '#')) {
418 int int_base = 10;
419 if ((srcpos + 1 < srclen) && (source[srcpos+1] == 'x')) {
420 int_base = 16;
421 srcpos += 1;
422 }
423 char * ptr;
424 // TODO: Fix hack (ptr may go past end of data)
425 unsigned long val = strtoul(source + srcpos + 1, &ptr, int_base);
426 if ((static_cast<size_t>(ptr - source) < srclen) && (*ptr == ';')) {
427 srcpos = ptr - source + 1;
428 } else {
429 // Not a valid escape sequence.
430 break;
431 }
432 if (size_t esclen = utf8_encode(buffer + bufpos, buflen - bufpos, val)) {
433 bufpos += esclen;
434 } else {
435 // Not enough room to encode the character, or illegal character
436 break;
437 }
438 } else {
439 // Unrecognized escape sequence.
440 break;
441 }
442 }
443 buffer[bufpos] = '\0';
444 return bufpos;
445 }
446
hex_encode(const char * source,size_t srclen)447 std::string hex_encode(const char * source, size_t srclen) {
448 const size_t kBufferSize = srclen * 2 + 1;
449 char* buffer = STACK_ARRAY(char, kBufferSize);
450 size_t length = hex_encode(buffer, kBufferSize, source, srclen);
451 return std::string(buffer, length);
452 }
453
hex_encode(char * buffer,size_t buflen,const char * csource,size_t srclen)454 size_t hex_encode(char * buffer, size_t buflen,
455 const char * csource, size_t srclen) {
456 ASSERT(NULL != buffer); // TODO: estimate output size
457 if (buflen <= 0)
458 return 0;
459
460 const unsigned char * bsource =
461 reinterpret_cast<const unsigned char *>(csource);
462
463 size_t srcpos = 0, bufpos = 0;
464 srclen = _min(srclen, (buflen - 1) / 2);
465 while (srcpos < srclen) {
466 unsigned char ch = bsource[srcpos++];
467 buffer[bufpos ] = hex_encode((ch >> 4) & 0xF);
468 buffer[bufpos+1] = hex_encode((ch ) & 0xF);
469 bufpos += 2;
470 }
471 buffer[bufpos] = '\0';
472 return bufpos;
473 }
474
hex_decode(char * cbuffer,size_t buflen,const char * source,size_t srclen)475 size_t hex_decode(char * cbuffer, size_t buflen,
476 const char * source, size_t srclen) {
477 ASSERT(NULL != cbuffer); // TODO: estimate output size
478 if (buflen <= 0)
479 return 0;
480
481 unsigned char * bbuffer = reinterpret_cast<unsigned char *>(cbuffer);
482
483 unsigned char h1, h2;
484 size_t srcpos = 0, bufpos = 0;
485 while ((srcpos + 1 < srclen)
486 && (bufpos + 1 < buflen)
487 && hex_decode(source[srcpos], &h1)
488 && hex_decode(source[srcpos+1], &h2))
489 {
490 bbuffer[bufpos++] = (h1 << 4) | h2;
491 srcpos += 2;
492 }
493 bbuffer[bufpos] = '\0';
494 return bufpos;
495 }
496
transform(std::string & value,size_t maxlen,const std::string & source,Transform t)497 size_t transform(std::string& value, size_t maxlen, const std::string& source,
498 Transform t) {
499 char* buffer = STACK_ARRAY(char, maxlen + 1);
500 size_t length = t(buffer, maxlen + 1, source.data(), source.length());
501 value.assign(buffer, length);
502 return length;
503 }
504
s_transform(const std::string & source,Transform t)505 std::string s_transform(const std::string& source, Transform t) {
506 // Ask transformation function to approximate the destination size (returns upper bound)
507 size_t maxlen = t(NULL, 0, source.data(), source.length());
508 char * buffer = STACK_ARRAY(char, maxlen);
509 size_t len = t(buffer, maxlen, source.data(), source.length());
510 std::string result(buffer, len);
511 return result;
512 }
513
tokenize(const std::string & source,char delimiter,std::vector<std::string> * fields)514 size_t tokenize(const std::string& source, char delimiter,
515 std::vector<std::string>* fields) {
516 ASSERT(NULL != fields);
517 fields->clear();
518 size_t last = 0;
519 for (size_t i = 0; i < source.length(); ++i) {
520 if (source[i] == delimiter) {
521 if (i != last) {
522 fields->push_back(source.substr(last, i - last));
523 }
524 last = i + 1;
525 }
526 }
527 if (last != source.length()) {
528 fields->push_back(source.substr(last, source.length() - last));
529 }
530 return fields->size();
531 }
532
split(const std::string & source,char delimiter,std::vector<std::string> * fields)533 size_t split(const std::string& source, char delimiter,
534 std::vector<std::string>* fields) {
535 ASSERT(NULL != fields);
536 fields->clear();
537 size_t last = 0;
538 for (size_t i = 0; i < source.length(); ++i) {
539 if (source[i] == delimiter) {
540 fields->push_back(source.substr(last, i - last));
541 last = i + 1;
542 }
543 }
544 fields->push_back(source.substr(last, source.length() - last));
545 return fields->size();
546 }
547
// Returns |c| unchanged if it may appear in a file name, or '_' if it may
// not.  Rejected are all characters that compare less than 32 and the
// punctuation characters < > : " / \ | * ?
char make_char_safe_for_filename(char c) {
  static const char kReserved[] = "<>:\"/\\|*?";

  if (c < 32)
    return '_';

  for (const char* reserved = kReserved; *reserved != '\0'; ++reserved) {
    if (*reserved == c)
      return '_';
  }
  return c;
}
568
569 /*
570 void sprintf(std::string& value, size_t maxlen, const char * format, ...) {
571 char * buffer = STACK_ARRAY(char, maxlen + 1);
572 va_list args;
573 va_start(args, format);
574 value.assign(buffer, vsprintfn(buffer, maxlen + 1, format, args));
575 va_end(args);
576 }
577 */
578
579 /////////////////////////////////////////////////////////////////////////////
580
581 } // namespace talk_base
582