1 /*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16 #define IN_LIBXML
17 #include "libxml.h"
18
19 #include <stdlib.h>
20 #include <string.h>
21 #include <limits.h>
22 #include <libxml/xmlmemory.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/xmlstring.h>
25
26 #include "private/parser.h"
27 #include "private/string.h"
28
29 /************************************************************************
30 * *
31 * Commodity functions to handle xmlChars *
32 * *
33 ************************************************************************/
34
35 /**
36 * xmlStrndup:
37 * @cur: the input xmlChar *
38 * @len: the len of @cur
39 *
40 * a strndup for array of xmlChar's
41 *
42 * Returns a new xmlChar * or NULL
43 */
44 xmlChar *
xmlStrndup(const xmlChar * cur,int len)45 xmlStrndup(const xmlChar *cur, int len) {
46 xmlChar *ret;
47
48 if ((cur == NULL) || (len < 0)) return(NULL);
49 ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
50 if (ret == NULL) {
51 return(NULL);
52 }
53 memcpy(ret, cur, len);
54 ret[len] = 0;
55 return(ret);
56 }
57
58 /**
59 * xmlStrdup:
60 * @cur: the input xmlChar *
61 *
62 * a strdup for array of xmlChar's. Since they are supposed to be
63 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
64 * a termination mark of '0'.
65 *
66 * Returns a new xmlChar * or NULL
67 */
68 xmlChar *
xmlStrdup(const xmlChar * cur)69 xmlStrdup(const xmlChar *cur) {
70 const xmlChar *p = cur;
71
72 if (cur == NULL) return(NULL);
73 while (*p != 0) p++; /* non input consuming */
74 return(xmlStrndup(cur, p - cur));
75 }
76
77 /**
78 * xmlCharStrndup:
79 * @cur: the input char *
80 * @len: the len of @cur
81 *
82 * a strndup for char's to xmlChar's
83 *
84 * Returns a new xmlChar * or NULL
85 */
86
87 xmlChar *
xmlCharStrndup(const char * cur,int len)88 xmlCharStrndup(const char *cur, int len) {
89 int i;
90 xmlChar *ret;
91
92 if ((cur == NULL) || (len < 0)) return(NULL);
93 ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
94 if (ret == NULL) {
95 return(NULL);
96 }
97 for (i = 0;i < len;i++) {
98 /* Explicit sign change */
99 ret[i] = (xmlChar) cur[i];
100 if (ret[i] == 0) return(ret);
101 }
102 ret[len] = 0;
103 return(ret);
104 }
105
106 /**
107 * xmlCharStrdup:
108 * @cur: the input char *
109 *
110 * a strdup for char's to xmlChar's
111 *
112 * Returns a new xmlChar * or NULL
113 */
114
115 xmlChar *
xmlCharStrdup(const char * cur)116 xmlCharStrdup(const char *cur) {
117 const char *p = cur;
118
119 if (cur == NULL) return(NULL);
120 while (*p != '\0') p++; /* non input consuming */
121 return(xmlCharStrndup(cur, p - cur));
122 }
123
124 /**
125 * xmlStrcmp:
126 * @str1: the first xmlChar *
127 * @str2: the second xmlChar *
128 *
129 * a strcmp for xmlChar's
130 *
131 * Returns the integer result of the comparison
132 */
133
134 int
xmlStrcmp(const xmlChar * str1,const xmlChar * str2)135 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
136 if (str1 == str2) return(0);
137 if (str1 == NULL) return(-1);
138 if (str2 == NULL) return(1);
139 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
140 return(strcmp((const char *)str1, (const char *)str2));
141 #else
142 do {
143 int tmp = *str1++ - *str2;
144 if (tmp != 0) return(tmp);
145 } while (*str2++ != 0);
146 return 0;
147 #endif
148 }
149
150 /**
151 * xmlStrEqual:
152 * @str1: the first xmlChar *
153 * @str2: the second xmlChar *
154 *
155 * Check if both strings are equal of have same content.
156 * Should be a bit more readable and faster than xmlStrcmp()
157 *
158 * Returns 1 if they are equal, 0 if they are different
159 */
160
161 int
xmlStrEqual(const xmlChar * str1,const xmlChar * str2)162 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
163 if (str1 == str2) return(1);
164 if (str1 == NULL) return(0);
165 if (str2 == NULL) return(0);
166 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
167 return(strcmp((const char *)str1, (const char *)str2) == 0);
168 #else
169 do {
170 if (*str1++ != *str2) return(0);
171 } while (*str2++);
172 return(1);
173 #endif
174 }
175
176 /**
177 * xmlStrQEqual:
178 * @pref: the prefix of the QName
179 * @name: the localname of the QName
180 * @str: the second xmlChar *
181 *
182 * Check if a QName is Equal to a given string
183 *
184 * Returns 1 if they are equal, 0 if they are different
185 */
186
187 int
xmlStrQEqual(const xmlChar * pref,const xmlChar * name,const xmlChar * str)188 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
189 if (pref == NULL) return(xmlStrEqual(name, str));
190 if (name == NULL) return(0);
191 if (str == NULL) return(0);
192
193 do {
194 if (*pref++ != *str) return(0);
195 } while ((*str++) && (*pref));
196 if (*str++ != ':') return(0);
197 do {
198 if (*name++ != *str) return(0);
199 } while (*str++);
200 return(1);
201 }
202
203 /**
204 * xmlStrncmp:
205 * @str1: the first xmlChar *
206 * @str2: the second xmlChar *
207 * @len: the max comparison length
208 *
209 * a strncmp for xmlChar's
210 *
211 * Returns the integer result of the comparison
212 */
213
214 int
xmlStrncmp(const xmlChar * str1,const xmlChar * str2,int len)215 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
216 if (len <= 0) return(0);
217 if (str1 == str2) return(0);
218 if (str1 == NULL) return(-1);
219 if (str2 == NULL) return(1);
220 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
221 return(strncmp((const char *)str1, (const char *)str2, len));
222 #else
223 do {
224 int tmp = *str1++ - *str2;
225 if (tmp != 0 || --len == 0) return(tmp);
226 } while (*str2++ != 0);
227 return 0;
228 #endif
229 }
230
231 static const xmlChar casemap[256] = {
232 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
233 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
234 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
235 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
236 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
237 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
238 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
239 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
240 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
241 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
242 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
243 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
244 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
245 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
246 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
247 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
248 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
249 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
250 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
251 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
252 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
253 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
254 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
255 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
256 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
257 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
258 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
259 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
260 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
261 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
262 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
263 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
264 };
265
266 /**
267 * xmlStrcasecmp:
268 * @str1: the first xmlChar *
269 * @str2: the second xmlChar *
270 *
271 * a strcasecmp for xmlChar's
272 *
273 * Returns the integer result of the comparison
274 */
275
276 int
xmlStrcasecmp(const xmlChar * str1,const xmlChar * str2)277 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
278 register int tmp;
279
280 if (str1 == str2) return(0);
281 if (str1 == NULL) return(-1);
282 if (str2 == NULL) return(1);
283 do {
284 tmp = casemap[*str1++] - casemap[*str2];
285 if (tmp != 0) return(tmp);
286 } while (*str2++ != 0);
287 return 0;
288 }
289
290 /**
291 * xmlStrncasecmp:
292 * @str1: the first xmlChar *
293 * @str2: the second xmlChar *
294 * @len: the max comparison length
295 *
296 * a strncasecmp for xmlChar's
297 *
298 * Returns the integer result of the comparison
299 */
300
301 int
xmlStrncasecmp(const xmlChar * str1,const xmlChar * str2,int len)302 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
303 register int tmp;
304
305 if (len <= 0) return(0);
306 if (str1 == str2) return(0);
307 if (str1 == NULL) return(-1);
308 if (str2 == NULL) return(1);
309 do {
310 tmp = casemap[*str1++] - casemap[*str2];
311 if (tmp != 0 || --len == 0) return(tmp);
312 } while (*str2++ != 0);
313 return 0;
314 }
315
316 /**
317 * xmlStrchr:
318 * @str: the xmlChar * array
319 * @val: the xmlChar to search
320 *
321 * a strchr for xmlChar's
322 *
323 * Returns the xmlChar * for the first occurrence or NULL.
324 */
325
326 const xmlChar *
xmlStrchr(const xmlChar * str,xmlChar val)327 xmlStrchr(const xmlChar *str, xmlChar val) {
328 if (str == NULL) return(NULL);
329 while (*str != 0) { /* non input consuming */
330 if (*str == val) return((xmlChar *) str);
331 str++;
332 }
333 return(NULL);
334 }
335
336 /**
337 * xmlStrstr:
338 * @str: the xmlChar * array (haystack)
339 * @val: the xmlChar to search (needle)
340 *
341 * a strstr for xmlChar's
342 *
343 * Returns the xmlChar * for the first occurrence or NULL.
344 */
345
346 const xmlChar *
xmlStrstr(const xmlChar * str,const xmlChar * val)347 xmlStrstr(const xmlChar *str, const xmlChar *val) {
348 int n;
349
350 if (str == NULL) return(NULL);
351 if (val == NULL) return(NULL);
352 n = xmlStrlen(val);
353
354 if (n == 0) return(str);
355 while (*str != 0) { /* non input consuming */
356 if (*str == *val) {
357 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
358 }
359 str++;
360 }
361 return(NULL);
362 }
363
364 /**
365 * xmlStrcasestr:
366 * @str: the xmlChar * array (haystack)
367 * @val: the xmlChar to search (needle)
368 *
369 * a case-ignoring strstr for xmlChar's
370 *
371 * Returns the xmlChar * for the first occurrence or NULL.
372 */
373
374 const xmlChar *
xmlStrcasestr(const xmlChar * str,const xmlChar * val)375 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
376 int n;
377
378 if (str == NULL) return(NULL);
379 if (val == NULL) return(NULL);
380 n = xmlStrlen(val);
381
382 if (n == 0) return(str);
383 while (*str != 0) { /* non input consuming */
384 if (casemap[*str] == casemap[*val])
385 if (!xmlStrncasecmp(str, val, n)) return(str);
386 str++;
387 }
388 return(NULL);
389 }
390
391 /**
392 * xmlStrsub:
393 * @str: the xmlChar * array (haystack)
394 * @start: the index of the first char (zero based)
395 * @len: the length of the substring
396 *
397 * Extract a substring of a given string
398 *
399 * Returns the xmlChar * for the first occurrence or NULL.
400 */
401
402 xmlChar *
xmlStrsub(const xmlChar * str,int start,int len)403 xmlStrsub(const xmlChar *str, int start, int len) {
404 int i;
405
406 if (str == NULL) return(NULL);
407 if (start < 0) return(NULL);
408 if (len < 0) return(NULL);
409
410 for (i = 0;i < start;i++) {
411 if (*str == 0) return(NULL);
412 str++;
413 }
414 if (*str == 0) return(NULL);
415 return(xmlStrndup(str, len));
416 }
417
418 /**
419 * xmlStrlen:
420 * @str: the xmlChar * array
421 *
422 * length of a xmlChar's string
423 *
424 * Returns the number of xmlChar contained in the ARRAY.
425 */
426
427 int
xmlStrlen(const xmlChar * str)428 xmlStrlen(const xmlChar *str) {
429 size_t len = str ? strlen((const char *)str) : 0;
430 return(len > INT_MAX ? 0 : len);
431 }
432
433 /**
434 * xmlStrncat:
435 * @cur: the original xmlChar * array
436 * @add: the xmlChar * array added
437 * @len: the length of @add
438 *
439 * a strncat for array of xmlChar's, it will extend @cur with the len
440 * first bytes of @add. Note that if @len < 0 then this is an API error
441 * and NULL will be returned.
442 *
443 * Returns a new xmlChar *, the original @cur is reallocated and should
444 * not be freed.
445 */
446
447 xmlChar *
xmlStrncat(xmlChar * cur,const xmlChar * add,int len)448 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449 int size;
450 xmlChar *ret;
451
452 if ((add == NULL) || (len == 0))
453 return(cur);
454 if (len < 0)
455 return(NULL);
456 if (cur == NULL)
457 return(xmlStrndup(add, len));
458
459 size = xmlStrlen(cur);
460 if ((size < 0) || (size > INT_MAX - len))
461 return(NULL);
462 ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
463 if (ret == NULL) {
464 return(cur);
465 }
466 memcpy(&ret[size], add, len);
467 ret[size + len] = 0;
468 return(ret);
469 }
470
471 /**
472 * xmlStrncatNew:
473 * @str1: first xmlChar string
474 * @str2: second xmlChar string
475 * @len: the len of @str2 or < 0
476 *
477 * same as xmlStrncat, but creates a new string. The original
478 * two strings are not freed. If @len is < 0 then the length
479 * will be calculated automatically.
480 *
481 * Returns a new xmlChar * or NULL
482 */
483 xmlChar *
xmlStrncatNew(const xmlChar * str1,const xmlChar * str2,int len)484 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
485 int size;
486 xmlChar *ret;
487
488 if (len < 0) {
489 len = xmlStrlen(str2);
490 if (len < 0)
491 return(NULL);
492 }
493 if ((str2 == NULL) || (len == 0))
494 return(xmlStrdup(str1));
495 if (str1 == NULL)
496 return(xmlStrndup(str2, len));
497
498 size = xmlStrlen(str1);
499 if ((size < 0) || (size > INT_MAX - len))
500 return(NULL);
501 ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
502 if (ret == NULL) {
503 return(xmlStrndup(str1, size));
504 }
505 memcpy(ret, str1, size);
506 memcpy(&ret[size], str2, len);
507 ret[size + len] = 0;
508 return(ret);
509 }
510
511 /**
512 * xmlStrcat:
513 * @cur: the original xmlChar * array
514 * @add: the xmlChar * array added
515 *
516 * a strcat for array of xmlChar's. Since they are supposed to be
517 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
518 * a termination mark of '0'.
519 *
520 * Returns a new xmlChar * containing the concatenated string. The original
521 * @cur is reallocated and should not be freed.
522 */
523 xmlChar *
xmlStrcat(xmlChar * cur,const xmlChar * add)524 xmlStrcat(xmlChar *cur, const xmlChar *add) {
525 const xmlChar *p = add;
526
527 if (add == NULL) return(cur);
528 if (cur == NULL)
529 return(xmlStrdup(add));
530
531 while (*p != 0) p++; /* non input consuming */
532 return(xmlStrncat(cur, add, p - add));
533 }
534
535 /**
536 * xmlStrPrintf:
537 * @buf: the result buffer.
538 * @len: the result buffer length.
539 * @msg: the message with printf formatting.
540 * @...: extra parameters for the message.
541 *
542 * Formats @msg and places result into @buf.
543 *
544 * Returns the number of characters written to @buf or -1 if an error occurs.
545 */
546 int
xmlStrPrintf(xmlChar * buf,int len,const char * msg,...)547 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
548 va_list args;
549 int ret;
550
551 if((buf == NULL) || (msg == NULL)) {
552 return(-1);
553 }
554
555 va_start(args, msg);
556 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
557 va_end(args);
558 buf[len - 1] = 0; /* be safe ! */
559
560 return(ret);
561 }
562
563 /**
564 * xmlStrVPrintf:
565 * @buf: the result buffer.
566 * @len: the result buffer length.
567 * @msg: the message with printf formatting.
568 * @ap: extra parameters for the message.
569 *
570 * Formats @msg and places result into @buf.
571 *
572 * Returns the number of characters written to @buf or -1 if an error occurs.
573 */
574 int
xmlStrVPrintf(xmlChar * buf,int len,const char * msg,va_list ap)575 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
576 int ret;
577
578 if((buf == NULL) || (msg == NULL)) {
579 return(-1);
580 }
581
582 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
583 buf[len - 1] = 0; /* be safe ! */
584
585 return(ret);
586 }
587
588 /************************************************************************
589 * *
590 * Generic UTF8 handling routines *
591 * *
592 * From rfc2044: encoding of the Unicode values on UTF-8: *
593 * *
594 * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
595 * 0000 0000-0000 007F 0xxxxxxx *
596 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
597 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
598 * *
599 * I hope we won't use values > 0xFFFF anytime soon ! *
600 * *
601 ************************************************************************/
602
603
604 /**
605 * xmlUTF8Size:
606 * @utf: pointer to the UTF8 character
607 *
608 * calculates the internal size of a UTF8 character
609 *
610 * returns the numbers of bytes in the character, -1 on format error
611 */
612 int
xmlUTF8Size(const xmlChar * utf)613 xmlUTF8Size(const xmlChar *utf) {
614 xmlChar mask;
615 int len;
616
617 if (utf == NULL)
618 return -1;
619 if (*utf < 0x80)
620 return 1;
621 /* check valid UTF8 character */
622 if (!(*utf & 0x40))
623 return -1;
624 /* determine number of bytes in char */
625 len = 2;
626 for (mask=0x20; mask != 0; mask>>=1) {
627 if (!(*utf & mask))
628 return len;
629 len++;
630 }
631 return -1;
632 }
633
634 /**
635 * xmlUTF8Charcmp:
636 * @utf1: pointer to first UTF8 char
637 * @utf2: pointer to second UTF8 char
638 *
639 * compares the two UCS4 values
640 *
641 * returns result of the compare as with xmlStrncmp
642 */
643 int
xmlUTF8Charcmp(const xmlChar * utf1,const xmlChar * utf2)644 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
645
646 if (utf1 == NULL ) {
647 if (utf2 == NULL)
648 return 0;
649 return -1;
650 }
651 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
652 }
653
654 /**
655 * xmlUTF8Strlen:
656 * @utf: a sequence of UTF-8 encoded bytes
657 *
658 * compute the length of an UTF8 string, it doesn't do a full UTF8
659 * checking of the content of the string.
660 *
661 * Returns the number of characters in the string or -1 in case of error
662 */
663 int
xmlUTF8Strlen(const xmlChar * utf)664 xmlUTF8Strlen(const xmlChar *utf) {
665 size_t ret = 0;
666
667 if (utf == NULL)
668 return(-1);
669
670 while (*utf != 0) {
671 if (utf[0] & 0x80) {
672 if ((utf[1] & 0xc0) != 0x80)
673 return(-1);
674 if ((utf[0] & 0xe0) == 0xe0) {
675 if ((utf[2] & 0xc0) != 0x80)
676 return(-1);
677 if ((utf[0] & 0xf0) == 0xf0) {
678 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
679 return(-1);
680 utf += 4;
681 } else {
682 utf += 3;
683 }
684 } else {
685 utf += 2;
686 }
687 } else {
688 utf++;
689 }
690 ret++;
691 }
692 return(ret > INT_MAX ? 0 : ret);
693 }
694
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Reads the first UTF-8 character from @utf. The lead byte must be
 * ASCII or a valid multi-byte lead (a stray continuation byte
 * 10xxxxxx is rejected), every continuation byte must be 10xxxxxx and
 * at most 4 bytes are accepted. Overlong encodings and the upper
 * bound of the Unicode range are not checked.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *         the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if ((utf == NULL) || (len == NULL) || (*len < 1))
        goto error;

    c = utf[0];
    if ((c & 0x80) == 0) {
        /* 1-byte code */
        *len = 1;
        return((int) c);
    }

    /*
     * A continuation byte (10xxxxxx) cannot start a character; decoding
     * it as a 2-byte lead would return a value for invalid input.
     */
    if ((c & 0xc0) != 0xc0)
        goto error;

    if (*len < 2)
        goto error;
    if ((utf[1] & 0xc0) != 0x80)
        goto error;

    if ((c & 0xe0) != 0xe0) {
        /* 2-byte code */
        *len = 2;
        c = (utf[0] & 0x1f) << 6;
        c |= utf[1] & 0x3f;
        return((int) c);
    }

    if (*len < 3)
        goto error;
    if ((utf[2] & 0xc0) != 0x80)
        goto error;

    if ((c & 0xf0) != 0xf0) {
        /* 3-byte code */
        *len = 3;
        c = (utf[0] & 0xf) << 12;
        c |= (utf[1] & 0x3f) << 6;
        c |= utf[2] & 0x3f;
        return((int) c);
    }

    if (*len < 4)
        goto error;
    /* the lead byte must be exactly 11110xxx */
    if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
        goto error;

    /* 4-byte code */
    *len = 4;
    c = (utf[0] & 0x7) << 18;
    c |= (utf[1] & 0x3f) << 12;
    c |= (utf[2] & 0x3f) << 6;
    c |= utf[3] & 0x3f;
    return((int) c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
764
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks that the zero-terminated string @utf is framed as UTF-8:
 * every character is 1 to 4 bytes long and every byte after the lead
 * byte is a continuation byte. The check is deliberately not strict:
 * overlong encodings are accepted and values above 0x10ffff are not
 * rejected.
 *
 * The accepted sequences are (in "bit format"):
 *    0xxxxxxx                                      valid 1-byte
 *    110xxxxx 10xxxxxx                             valid 2-byte
 *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
 *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
 *
 * Return value: 1 if @utf is valid (the empty string included),
 * 0 if @utf is NULL or malformed.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *p;

    if (utf == NULL)
        return(0);

    p = utf;
    while (*p != 0) {
        unsigned char lead = *p;
        int trail;
        int k;

        /* number of continuation bytes announced by the lead byte */
        if ((lead & 0x80) == 0x00)
            trail = 0;          /* starts with 0 */
        else if ((lead & 0xe0) == 0xc0)
            trail = 1;          /* starts with 110 */
        else if ((lead & 0xf0) == 0xe0)
            trail = 2;          /* starts with 1110 */
        else if ((lead & 0xf8) == 0xf0)
            trail = 3;          /* starts with 11110 */
        else
            return(0);          /* unknown encoding */

        /*
         * Checked in order, so a terminator inside the sequence fails
         * the test before any byte beyond it is read.
         */
        for (k = 1; k <= trail; k++) {
            if ((p[k] & 0xc0) != 0x80)
                return(0);
        }
        p += trail + 1;
    }
    return(1);
}
819
820 /**
821 * xmlUTF8Strsize:
822 * @utf: a sequence of UTF-8 encoded bytes
823 * @len: the number of characters in the array
824 *
825 * storage size of an UTF8 string
826 * the behaviour is not guaranteed if the input string is not UTF-8
827 *
828 * Returns the storage size of
829 * the first 'len' characters of ARRAY
830 */
831
832 int
xmlUTF8Strsize(const xmlChar * utf,int len)833 xmlUTF8Strsize(const xmlChar *utf, int len) {
834 const xmlChar *ptr=utf;
835 int ch;
836 size_t ret;
837
838 if (utf == NULL)
839 return(0);
840
841 if (len <= 0)
842 return(0);
843
844 while ( len-- > 0) {
845 if ( !*ptr )
846 break;
847 if ( (ch = *ptr++) & 0x80)
848 while ((ch<<=1) & 0x80 ) {
849 if (*ptr == 0) break;
850 ptr++;
851 }
852 }
853 ret = ptr - utf;
854 return (ret > INT_MAX ? 0 : ret);
855 }
856
857
858 /**
859 * xmlUTF8Strndup:
860 * @utf: the input UTF8 *
861 * @len: the len of @utf (in chars)
862 *
863 * a strndup for array of UTF8's
864 *
865 * Returns a new UTF8 * or NULL
866 */
867 xmlChar *
xmlUTF8Strndup(const xmlChar * utf,int len)868 xmlUTF8Strndup(const xmlChar *utf, int len) {
869 xmlChar *ret;
870 int i;
871
872 if ((utf == NULL) || (len < 0)) return(NULL);
873 i = xmlUTF8Strsize(utf, len);
874 ret = (xmlChar *) xmlMallocAtomic((size_t) i + 1);
875 if (ret == NULL) {
876 return(NULL);
877 }
878 memcpy(ret, utf, i);
879 ret[i] = 0;
880 return(ret);
881 }
882
883 /**
884 * xmlUTF8Strpos:
885 * @utf: the input UTF8 *
886 * @pos: the position of the desired UTF8 char (in chars)
887 *
888 * a function to provide the equivalent of fetching a
889 * character from a string array
890 *
891 * Returns a pointer to the UTF8 character or NULL
892 */
893 const xmlChar *
xmlUTF8Strpos(const xmlChar * utf,int pos)894 xmlUTF8Strpos(const xmlChar *utf, int pos) {
895 int ch;
896
897 if (utf == NULL) return(NULL);
898 if (pos < 0)
899 return(NULL);
900 while (pos--) {
901 if ((ch=*utf++) == 0) return(NULL);
902 if ( ch & 0x80 ) {
903 /* if not simple ascii, verify proper format */
904 if ( (ch & 0xc0) != 0xc0 )
905 return(NULL);
906 /* then skip over remaining bytes for this char */
907 while ( (ch <<= 1) & 0x80 )
908 if ( (*utf++ & 0xc0) != 0x80 )
909 return(NULL);
910 }
911 }
912 return((xmlChar *)utf);
913 }
914
915 /**
916 * xmlUTF8Strloc:
917 * @utf: the input UTF8 *
918 * @utfchar: the UTF8 character to be found
919 *
920 * a function to provide the relative location of a UTF8 char
921 *
922 * Returns the relative character position of the desired char
923 * or -1 if not found
924 */
925 int
xmlUTF8Strloc(const xmlChar * utf,const xmlChar * utfchar)926 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
927 size_t i;
928 int size;
929 int ch;
930
931 if (utf==NULL || utfchar==NULL) return -1;
932 size = xmlUTF8Strsize(utfchar, 1);
933 for(i=0; (ch=*utf) != 0; i++) {
934 if (xmlStrncmp(utf, utfchar, size)==0)
935 return(i > INT_MAX ? 0 : i);
936 utf++;
937 if ( ch & 0x80 ) {
938 /* if not simple ascii, verify proper format */
939 if ( (ch & 0xc0) != 0xc0 )
940 return(-1);
941 /* then skip over remaining bytes for this char */
942 while ( (ch <<= 1) & 0x80 )
943 if ( (*utf++ & 0xc0) != 0x80 )
944 return(-1);
945 }
946 }
947
948 return(-1);
949 }
950 /**
951 * xmlUTF8Strsub:
952 * @utf: a sequence of UTF-8 encoded bytes
953 * @start: relative pos of first char
954 * @len: total number to copy
955 *
956 * Create a substring from a given UTF-8 string
957 * Note: positions are given in units of UTF-8 chars
958 *
959 * Returns a pointer to a newly created string
960 * or NULL if any problem
961 */
962
963 xmlChar *
xmlUTF8Strsub(const xmlChar * utf,int start,int len)964 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
965 int i;
966 int ch;
967
968 if (utf == NULL) return(NULL);
969 if (start < 0) return(NULL);
970 if (len < 0) return(NULL);
971
972 /*
973 * Skip over any leading chars
974 */
975 for (i = 0;i < start;i++) {
976 if ((ch=*utf++) == 0) return(NULL);
977 if ( ch & 0x80 ) {
978 /* if not simple ascii, verify proper format */
979 if ( (ch & 0xc0) != 0xc0 )
980 return(NULL);
981 /* then skip over remaining bytes for this char */
982 while ( (ch <<= 1) & 0x80 )
983 if ( (*utf++ & 0xc0) != 0x80 )
984 return(NULL);
985 }
986 }
987
988 return(xmlUTF8Strndup(utf, len));
989 }
990
991 /**
992 * xmlEscapeFormatString:
993 * @msg: a pointer to the string in which to escape '%' characters.
994 * Must be a heap-allocated buffer created by libxml2 that may be
995 * returned, or that may be freed and replaced.
996 *
997 * Replaces the string pointed to by 'msg' with an escaped string.
998 * Returns the same string with all '%' characters escaped.
999 */
1000 xmlChar *
xmlEscapeFormatString(xmlChar ** msg)1001 xmlEscapeFormatString(xmlChar **msg)
1002 {
1003 xmlChar *msgPtr = NULL;
1004 xmlChar *result = NULL;
1005 xmlChar *resultPtr = NULL;
1006 size_t count = 0;
1007 size_t msgLen = 0;
1008 size_t resultLen = 0;
1009
1010 if (!msg || !*msg)
1011 return(NULL);
1012
1013 for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1014 ++msgLen;
1015 if (*msgPtr == '%')
1016 ++count;
1017 }
1018
1019 if (count == 0)
1020 return(*msg);
1021
1022 if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1023 return(NULL);
1024 resultLen = msgLen + count + 1;
1025 result = (xmlChar *) xmlMallocAtomic(resultLen);
1026 if (result == NULL) {
1027 /* Clear *msg to prevent format string vulnerabilities in
1028 out-of-memory situations. */
1029 xmlFree(*msg);
1030 *msg = NULL;
1031 return(NULL);
1032 }
1033
1034 for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1035 *resultPtr = *msgPtr;
1036 if (*msgPtr == '%')
1037 *(++resultPtr) = '%';
1038 }
1039 result[resultLen - 1] = '\0';
1040
1041 xmlFree(*msg);
1042 *msg = result;
1043
1044 return *msg;
1045 }
1046
1047