• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * string.c : an XML string utilities module
3  *
4  * This module provides various utility functions for manipulating
5  * the xmlChar* type. All functions named xmlStr* have been moved here
6  * from the parser.c file (their original home).
7  *
8  * See Copyright for the status of this software.
9  *
10  * UTF8 string routines from:
11  * William Brack <wbrack@mmm.com.hk>
12  *
13  * daniel@veillard.com
14  */
15 
16 #define IN_LIBXML
17 #include "libxml.h"
18 
19 #include <stdlib.h>
20 #include <string.h>
21 #include <limits.h>
22 #include <libxml/xmlmemory.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/xmlstring.h>
25 
26 #include "private/parser.h"
27 #include "private/string.h"
28 
29 /************************************************************************
30  *                                                                      *
31  *                Commodity functions to handle xmlChars                *
32  *                                                                      *
33  ************************************************************************/
34 
35 /**
36  * xmlStrndup:
37  * @cur:  the input xmlChar *
38  * @len:  the len of @cur
39  *
40  * a strndup for array of xmlChar's
41  *
42  * Returns a new xmlChar * or NULL
43  */
44 xmlChar *
xmlStrndup(const xmlChar * cur,int len)45 xmlStrndup(const xmlChar *cur, int len) {
46     xmlChar *ret;
47 
48     if ((cur == NULL) || (len < 0)) return(NULL);
49     ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
50     if (ret == NULL) {
51         return(NULL);
52     }
53     memcpy(ret, cur, len);
54     ret[len] = 0;
55     return(ret);
56 }
57 
58 /**
59  * xmlStrdup:
60  * @cur:  the input xmlChar *
61  *
62  * a strdup for array of xmlChar's. Since they are supposed to be
63  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
64  * a termination mark of '0'.
65  *
66  * Returns a new xmlChar * or NULL
67  */
68 xmlChar *
xmlStrdup(const xmlChar * cur)69 xmlStrdup(const xmlChar *cur) {
70     const xmlChar *p = cur;
71 
72     if (cur == NULL) return(NULL);
73     while (*p != 0) p++; /* non input consuming */
74     return(xmlStrndup(cur, p - cur));
75 }
76 
77 /**
78  * xmlCharStrndup:
79  * @cur:  the input char *
80  * @len:  the len of @cur
81  *
82  * a strndup for char's to xmlChar's
83  *
84  * Returns a new xmlChar * or NULL
85  */
86 
87 xmlChar *
xmlCharStrndup(const char * cur,int len)88 xmlCharStrndup(const char *cur, int len) {
89     int i;
90     xmlChar *ret;
91 
92     if ((cur == NULL) || (len < 0)) return(NULL);
93     ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
94     if (ret == NULL) {
95         return(NULL);
96     }
97     for (i = 0;i < len;i++) {
98         /* Explicit sign change */
99         ret[i] = (xmlChar) cur[i];
100         if (ret[i] == 0) return(ret);
101     }
102     ret[len] = 0;
103     return(ret);
104 }
105 
106 /**
107  * xmlCharStrdup:
108  * @cur:  the input char *
109  *
110  * a strdup for char's to xmlChar's
111  *
112  * Returns a new xmlChar * or NULL
113  */
114 
115 xmlChar *
xmlCharStrdup(const char * cur)116 xmlCharStrdup(const char *cur) {
117     const char *p = cur;
118 
119     if (cur == NULL) return(NULL);
120     while (*p != '\0') p++; /* non input consuming */
121     return(xmlCharStrndup(cur, p - cur));
122 }
123 
124 /**
125  * xmlStrcmp:
126  * @str1:  the first xmlChar *
127  * @str2:  the second xmlChar *
128  *
129  * a strcmp for xmlChar's
130  *
131  * Returns the integer result of the comparison
132  */
133 
134 int
xmlStrcmp(const xmlChar * str1,const xmlChar * str2)135 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
136     if (str1 == str2) return(0);
137     if (str1 == NULL) return(-1);
138     if (str2 == NULL) return(1);
139 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
140     return(strcmp((const char *)str1, (const char *)str2));
141 #else
142     do {
143         int tmp = *str1++ - *str2;
144         if (tmp != 0) return(tmp);
145     } while (*str2++ != 0);
146     return 0;
147 #endif
148 }
149 
150 /**
151  * xmlStrEqual:
152  * @str1:  the first xmlChar *
153  * @str2:  the second xmlChar *
154  *
155  * Check if both strings are equal of have same content.
156  * Should be a bit more readable and faster than xmlStrcmp()
157  *
158  * Returns 1 if they are equal, 0 if they are different
159  */
160 
161 int
xmlStrEqual(const xmlChar * str1,const xmlChar * str2)162 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
163     if (str1 == str2) return(1);
164     if (str1 == NULL) return(0);
165     if (str2 == NULL) return(0);
166 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
167     return(strcmp((const char *)str1, (const char *)str2) == 0);
168 #else
169     do {
170         if (*str1++ != *str2) return(0);
171     } while (*str2++);
172     return(1);
173 #endif
174 }
175 
176 /**
177  * xmlStrQEqual:
178  * @pref:  the prefix of the QName
179  * @name:  the localname of the QName
180  * @str:  the second xmlChar *
181  *
182  * Check if a QName is Equal to a given string
183  *
184  * Returns 1 if they are equal, 0 if they are different
185  */
186 
187 int
xmlStrQEqual(const xmlChar * pref,const xmlChar * name,const xmlChar * str)188 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
189     if (pref == NULL) return(xmlStrEqual(name, str));
190     if (name == NULL) return(0);
191     if (str == NULL) return(0);
192 
193     do {
194         if (*pref++ != *str) return(0);
195     } while ((*str++) && (*pref));
196     if (*str++ != ':') return(0);
197     do {
198         if (*name++ != *str) return(0);
199     } while (*str++);
200     return(1);
201 }
202 
203 /**
204  * xmlStrncmp:
205  * @str1:  the first xmlChar *
206  * @str2:  the second xmlChar *
207  * @len:  the max comparison length
208  *
209  * a strncmp for xmlChar's
210  *
211  * Returns the integer result of the comparison
212  */
213 
214 int
xmlStrncmp(const xmlChar * str1,const xmlChar * str2,int len)215 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
216     if (len <= 0) return(0);
217     if (str1 == str2) return(0);
218     if (str1 == NULL) return(-1);
219     if (str2 == NULL) return(1);
220 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
221     return(strncmp((const char *)str1, (const char *)str2, len));
222 #else
223     do {
224         int tmp = *str1++ - *str2;
225         if (tmp != 0 || --len == 0) return(tmp);
226     } while (*str2++ != 0);
227     return 0;
228 #endif
229 }
230 
231 static const xmlChar casemap[256] = {
232     0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
233     0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
234     0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
235     0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
236     0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
237     0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
238     0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
239     0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
240     0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
241     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
242     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
243     0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
244     0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
245     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
246     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
247     0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
248     0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
249     0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
250     0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
251     0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
252     0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
253     0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
254     0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
255     0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
256     0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
257     0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
258     0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
259     0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
260     0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
261     0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
262     0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
263     0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
264 };
265 
266 /**
267  * xmlStrcasecmp:
268  * @str1:  the first xmlChar *
269  * @str2:  the second xmlChar *
270  *
271  * a strcasecmp for xmlChar's
272  *
273  * Returns the integer result of the comparison
274  */
275 
276 int
xmlStrcasecmp(const xmlChar * str1,const xmlChar * str2)277 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
278     register int tmp;
279 
280     if (str1 == str2) return(0);
281     if (str1 == NULL) return(-1);
282     if (str2 == NULL) return(1);
283     do {
284         tmp = casemap[*str1++] - casemap[*str2];
285         if (tmp != 0) return(tmp);
286     } while (*str2++ != 0);
287     return 0;
288 }
289 
290 /**
291  * xmlStrncasecmp:
292  * @str1:  the first xmlChar *
293  * @str2:  the second xmlChar *
294  * @len:  the max comparison length
295  *
296  * a strncasecmp for xmlChar's
297  *
298  * Returns the integer result of the comparison
299  */
300 
301 int
xmlStrncasecmp(const xmlChar * str1,const xmlChar * str2,int len)302 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
303     register int tmp;
304 
305     if (len <= 0) return(0);
306     if (str1 == str2) return(0);
307     if (str1 == NULL) return(-1);
308     if (str2 == NULL) return(1);
309     do {
310         tmp = casemap[*str1++] - casemap[*str2];
311         if (tmp != 0 || --len == 0) return(tmp);
312     } while (*str2++ != 0);
313     return 0;
314 }
315 
316 /**
317  * xmlStrchr:
318  * @str:  the xmlChar * array
319  * @val:  the xmlChar to search
320  *
321  * a strchr for xmlChar's
322  *
323  * Returns the xmlChar * for the first occurrence or NULL.
324  */
325 
326 const xmlChar *
xmlStrchr(const xmlChar * str,xmlChar val)327 xmlStrchr(const xmlChar *str, xmlChar val) {
328     if (str == NULL) return(NULL);
329     while (*str != 0) { /* non input consuming */
330         if (*str == val) return((xmlChar *) str);
331         str++;
332     }
333     return(NULL);
334 }
335 
336 /**
337  * xmlStrstr:
338  * @str:  the xmlChar * array (haystack)
339  * @val:  the xmlChar to search (needle)
340  *
341  * a strstr for xmlChar's
342  *
343  * Returns the xmlChar * for the first occurrence or NULL.
344  */
345 
346 const xmlChar *
xmlStrstr(const xmlChar * str,const xmlChar * val)347 xmlStrstr(const xmlChar *str, const xmlChar *val) {
348     int n;
349 
350     if (str == NULL) return(NULL);
351     if (val == NULL) return(NULL);
352     n = xmlStrlen(val);
353 
354     if (n == 0) return(str);
355     while (*str != 0) { /* non input consuming */
356         if (*str == *val) {
357             if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
358         }
359         str++;
360     }
361     return(NULL);
362 }
363 
364 /**
365  * xmlStrcasestr:
366  * @str:  the xmlChar * array (haystack)
367  * @val:  the xmlChar to search (needle)
368  *
369  * a case-ignoring strstr for xmlChar's
370  *
371  * Returns the xmlChar * for the first occurrence or NULL.
372  */
373 
374 const xmlChar *
xmlStrcasestr(const xmlChar * str,const xmlChar * val)375 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
376     int n;
377 
378     if (str == NULL) return(NULL);
379     if (val == NULL) return(NULL);
380     n = xmlStrlen(val);
381 
382     if (n == 0) return(str);
383     while (*str != 0) { /* non input consuming */
384         if (casemap[*str] == casemap[*val])
385             if (!xmlStrncasecmp(str, val, n)) return(str);
386         str++;
387     }
388     return(NULL);
389 }
390 
391 /**
392  * xmlStrsub:
393  * @str:  the xmlChar * array (haystack)
394  * @start:  the index of the first char (zero based)
395  * @len:  the length of the substring
396  *
397  * Extract a substring of a given string
398  *
399  * Returns the xmlChar * for the first occurrence or NULL.
400  */
401 
402 xmlChar *
xmlStrsub(const xmlChar * str,int start,int len)403 xmlStrsub(const xmlChar *str, int start, int len) {
404     int i;
405 
406     if (str == NULL) return(NULL);
407     if (start < 0) return(NULL);
408     if (len < 0) return(NULL);
409 
410     for (i = 0;i < start;i++) {
411         if (*str == 0) return(NULL);
412         str++;
413     }
414     if (*str == 0) return(NULL);
415     return(xmlStrndup(str, len));
416 }
417 
418 /**
419  * xmlStrlen:
420  * @str:  the xmlChar * array
421  *
422  * length of a xmlChar's string
423  *
424  * Returns the number of xmlChar contained in the ARRAY.
425  */
426 
427 int
xmlStrlen(const xmlChar * str)428 xmlStrlen(const xmlChar *str) {
429     size_t len = str ? strlen((const char *)str) : 0;
430     return(len > INT_MAX ? 0 : len);
431 }
432 
433 /**
434  * xmlStrncat:
435  * @cur:  the original xmlChar * array
436  * @add:  the xmlChar * array added
437  * @len:  the length of @add
438  *
439  * a strncat for array of xmlChar's, it will extend @cur with the len
440  * first bytes of @add. Note that if @len < 0 then this is an API error
441  * and NULL will be returned.
442  *
443  * Returns a new xmlChar *, the original @cur is reallocated and should
444  * not be freed.
445  */
446 
447 xmlChar *
xmlStrncat(xmlChar * cur,const xmlChar * add,int len)448 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449     int size;
450     xmlChar *ret;
451 
452     if ((add == NULL) || (len == 0))
453         return(cur);
454     if (len < 0)
455 	return(NULL);
456     if (cur == NULL)
457         return(xmlStrndup(add, len));
458 
459     size = xmlStrlen(cur);
460     if ((size < 0) || (size > INT_MAX - len))
461         return(NULL);
462     ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
463     if (ret == NULL) {
464         return(cur);
465     }
466     memcpy(&ret[size], add, len);
467     ret[size + len] = 0;
468     return(ret);
469 }
470 
471 /**
472  * xmlStrncatNew:
473  * @str1:  first xmlChar string
474  * @str2:  second xmlChar string
475  * @len:  the len of @str2 or < 0
476  *
477  * same as xmlStrncat, but creates a new string.  The original
478  * two strings are not freed. If @len is < 0 then the length
479  * will be calculated automatically.
480  *
481  * Returns a new xmlChar * or NULL
482  */
483 xmlChar *
xmlStrncatNew(const xmlChar * str1,const xmlChar * str2,int len)484 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
485     int size;
486     xmlChar *ret;
487 
488     if (len < 0) {
489         len = xmlStrlen(str2);
490         if (len < 0)
491             return(NULL);
492     }
493     if ((str2 == NULL) || (len == 0))
494         return(xmlStrdup(str1));
495     if (str1 == NULL)
496         return(xmlStrndup(str2, len));
497 
498     size = xmlStrlen(str1);
499     if ((size < 0) || (size > INT_MAX - len))
500         return(NULL);
501     ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
502     if (ret == NULL) {
503         return(xmlStrndup(str1, size));
504     }
505     memcpy(ret, str1, size);
506     memcpy(&ret[size], str2, len);
507     ret[size + len] = 0;
508     return(ret);
509 }
510 
511 /**
512  * xmlStrcat:
513  * @cur:  the original xmlChar * array
514  * @add:  the xmlChar * array added
515  *
516  * a strcat for array of xmlChar's. Since they are supposed to be
517  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
518  * a termination mark of '0'.
519  *
520  * Returns a new xmlChar * containing the concatenated string. The original
521  * @cur is reallocated and should not be freed.
522  */
523 xmlChar *
xmlStrcat(xmlChar * cur,const xmlChar * add)524 xmlStrcat(xmlChar *cur, const xmlChar *add) {
525     const xmlChar *p = add;
526 
527     if (add == NULL) return(cur);
528     if (cur == NULL)
529         return(xmlStrdup(add));
530 
531     while (*p != 0) p++; /* non input consuming */
532     return(xmlStrncat(cur, add, p - add));
533 }
534 
535 /**
536  * xmlStrPrintf:
537  * @buf:   the result buffer.
538  * @len:   the result buffer length.
539  * @msg:   the message with printf formatting.
540  * @...:   extra parameters for the message.
541  *
542  * Formats @msg and places result into @buf.
543  *
544  * Returns the number of characters written to @buf or -1 if an error occurs.
545  */
546 int
xmlStrPrintf(xmlChar * buf,int len,const char * msg,...)547 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
548     va_list args;
549     int ret;
550 
551     if((buf == NULL) || (msg == NULL)) {
552         return(-1);
553     }
554 
555     va_start(args, msg);
556     ret = vsnprintf((char *) buf, len, (const char *) msg, args);
557     va_end(args);
558     buf[len - 1] = 0; /* be safe ! */
559 
560     return(ret);
561 }
562 
563 /**
564  * xmlStrVPrintf:
565  * @buf:   the result buffer.
566  * @len:   the result buffer length.
567  * @msg:   the message with printf formatting.
568  * @ap:    extra parameters for the message.
569  *
570  * Formats @msg and places result into @buf.
571  *
572  * Returns the number of characters written to @buf or -1 if an error occurs.
573  */
574 int
xmlStrVPrintf(xmlChar * buf,int len,const char * msg,va_list ap)575 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
576     int ret;
577 
578     if((buf == NULL) || (msg == NULL)) {
579         return(-1);
580     }
581 
582     ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
583     buf[len - 1] = 0; /* be safe ! */
584 
585     return(ret);
586 }
587 
588 /************************************************************************
589  *                                                                      *
590  *              Generic UTF8 handling routines                          *
591  *                                                                      *
592  * From rfc2044: encoding of the Unicode values on UTF-8:               *
593  *                                                                      *
594  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
595  * 0000 0000-0000 007F   0xxxxxxx                                       *
596  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
597  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
598  *                                                                      *
599  * I hope we won't use values > 0xFFFF anytime soon !                   *
600  *                                                                      *
601  ************************************************************************/
602 
603 
604 /**
605  * xmlUTF8Size:
606  * @utf: pointer to the UTF8 character
607  *
608  * calculates the internal size of a UTF8 character
609  *
610  * returns the numbers of bytes in the character, -1 on format error
611  */
612 int
xmlUTF8Size(const xmlChar * utf)613 xmlUTF8Size(const xmlChar *utf) {
614     xmlChar mask;
615     int len;
616 
617     if (utf == NULL)
618         return -1;
619     if (*utf < 0x80)
620         return 1;
621     /* check valid UTF8 character */
622     if (!(*utf & 0x40))
623         return -1;
624     /* determine number of bytes in char */
625     len = 2;
626     for (mask=0x20; mask != 0; mask>>=1) {
627         if (!(*utf & mask))
628             return len;
629         len++;
630     }
631     return -1;
632 }
633 
634 /**
635  * xmlUTF8Charcmp:
636  * @utf1: pointer to first UTF8 char
637  * @utf2: pointer to second UTF8 char
638  *
639  * compares the two UCS4 values
640  *
641  * returns result of the compare as with xmlStrncmp
642  */
643 int
xmlUTF8Charcmp(const xmlChar * utf1,const xmlChar * utf2)644 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
645 
646     if (utf1 == NULL ) {
647         if (utf2 == NULL)
648             return 0;
649         return -1;
650     }
651     return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
652 }
653 
654 /**
655  * xmlUTF8Strlen:
656  * @utf:  a sequence of UTF-8 encoded bytes
657  *
658  * compute the length of an UTF8 string, it doesn't do a full UTF8
659  * checking of the content of the string.
660  *
661  * Returns the number of characters in the string or -1 in case of error
662  */
663 int
xmlUTF8Strlen(const xmlChar * utf)664 xmlUTF8Strlen(const xmlChar *utf) {
665     size_t ret = 0;
666 
667     if (utf == NULL)
668         return(-1);
669 
670     while (*utf != 0) {
671         if (utf[0] & 0x80) {
672             if ((utf[1] & 0xc0) != 0x80)
673                 return(-1);
674             if ((utf[0] & 0xe0) == 0xe0) {
675                 if ((utf[2] & 0xc0) != 0x80)
676                     return(-1);
677                 if ((utf[0] & 0xf0) == 0xf0) {
678                     if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
679                         return(-1);
680                     utf += 4;
681                 } else {
682                     utf += 3;
683                 }
684             } else {
685                 utf += 2;
686             }
687         } else {
688             utf++;
689         }
690         ret++;
691     }
692     return(ret > INT_MAX ? 0 : ret);
693 }
694 
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. The lead byte must be
 * 0xxxxxxx, 110xxxxx, 1110xxxx or 11110xxx and every following byte of
 * the sequence must be 10xxxxxx. Overlong encodings and the 0x10FFFF
 * upper bound are not checked.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /*
         * 10xxxxxx is a continuation byte and can never start a
         * character; without this check it would be decoded as the
         * lead of a 2-byte sequence.
         */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return((int) c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
764 
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *p;

    if (utf == NULL)
        return(0);

    /*
     * Each character is 1, 2, 3 or 4 bytes long.  The accepted
     * patterns are (in "bit format"):
     *    0xxxxxxx                                      1-byte
     *    110xxxxx 10xxxxxx                             2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           4-byte
     */
    p = utf;
    while (*p != 0) {           /* string is 0-terminated */
        unsigned char lead = *p;
        int extra;
        int k;

        if ((lead & 0x80) == 0x00)      /* 1-byte code, starts with 0 */
            extra = 0;
        else if ((lead & 0xe0) == 0xc0) /* 2-byte code, starts with 110 */
            extra = 1;
        else if ((lead & 0xf0) == 0xe0) /* 3-byte code, starts with 1110 */
            extra = 2;
        else if ((lead & 0xf8) == 0xf0) /* 4-byte code, starts with 11110 */
            extra = 3;
        else                            /* unknown encoding */
            return 0;

        /*
         * Check continuation bytes in order and stop at the first bad
         * one; a terminating zero fails the test, so the scan never
         * runs past the end of the string.
         */
        for (k = 1; k <= extra; k++) {
            if ((p[k] & 0xc0) != 0x80)
                return 0;
        }
        p += extra + 1;
    }
    return(1);
}
819 
820 /**
821  * xmlUTF8Strsize:
822  * @utf:  a sequence of UTF-8 encoded bytes
823  * @len:  the number of characters in the array
824  *
825  * storage size of an UTF8 string
826  * the behaviour is not guaranteed if the input string is not UTF-8
827  *
828  * Returns the storage size of
829  * the first 'len' characters of ARRAY
830  */
831 
832 int
xmlUTF8Strsize(const xmlChar * utf,int len)833 xmlUTF8Strsize(const xmlChar *utf, int len) {
834     const xmlChar *ptr=utf;
835     int ch;
836     size_t ret;
837 
838     if (utf == NULL)
839         return(0);
840 
841     if (len <= 0)
842         return(0);
843 
844     while ( len-- > 0) {
845         if ( !*ptr )
846             break;
847         if ( (ch = *ptr++) & 0x80)
848             while ((ch<<=1) & 0x80 ) {
849 		if (*ptr == 0) break;
850                 ptr++;
851 	    }
852     }
853     ret = ptr - utf;
854     return (ret > INT_MAX ? 0 : ret);
855 }
856 
857 
858 /**
859  * xmlUTF8Strndup:
860  * @utf:  the input UTF8 *
861  * @len:  the len of @utf (in chars)
862  *
863  * a strndup for array of UTF8's
864  *
865  * Returns a new UTF8 * or NULL
866  */
867 xmlChar *
xmlUTF8Strndup(const xmlChar * utf,int len)868 xmlUTF8Strndup(const xmlChar *utf, int len) {
869     xmlChar *ret;
870     int i;
871 
872     if ((utf == NULL) || (len < 0)) return(NULL);
873     i = xmlUTF8Strsize(utf, len);
874     ret = (xmlChar *) xmlMallocAtomic((size_t) i + 1);
875     if (ret == NULL) {
876         return(NULL);
877     }
878     memcpy(ret, utf, i);
879     ret[i] = 0;
880     return(ret);
881 }
882 
883 /**
884  * xmlUTF8Strpos:
885  * @utf:  the input UTF8 *
886  * @pos:  the position of the desired UTF8 char (in chars)
887  *
888  * a function to provide the equivalent of fetching a
889  * character from a string array
890  *
891  * Returns a pointer to the UTF8 character or NULL
892  */
893 const xmlChar *
xmlUTF8Strpos(const xmlChar * utf,int pos)894 xmlUTF8Strpos(const xmlChar *utf, int pos) {
895     int ch;
896 
897     if (utf == NULL) return(NULL);
898     if (pos < 0)
899         return(NULL);
900     while (pos--) {
901         if ((ch=*utf++) == 0) return(NULL);
902         if ( ch & 0x80 ) {
903             /* if not simple ascii, verify proper format */
904             if ( (ch & 0xc0) != 0xc0 )
905                 return(NULL);
906             /* then skip over remaining bytes for this char */
907             while ( (ch <<= 1) & 0x80 )
908                 if ( (*utf++ & 0xc0) != 0x80 )
909                     return(NULL);
910         }
911     }
912     return((xmlChar *)utf);
913 }
914 
915 /**
916  * xmlUTF8Strloc:
917  * @utf:  the input UTF8 *
918  * @utfchar:  the UTF8 character to be found
919  *
920  * a function to provide the relative location of a UTF8 char
921  *
922  * Returns the relative character position of the desired char
923  * or -1 if not found
924  */
925 int
xmlUTF8Strloc(const xmlChar * utf,const xmlChar * utfchar)926 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
927     size_t i;
928     int size;
929     int ch;
930 
931     if (utf==NULL || utfchar==NULL) return -1;
932     size = xmlUTF8Strsize(utfchar, 1);
933         for(i=0; (ch=*utf) != 0; i++) {
934             if (xmlStrncmp(utf, utfchar, size)==0)
935                 return(i > INT_MAX ? 0 : i);
936             utf++;
937             if ( ch & 0x80 ) {
938                 /* if not simple ascii, verify proper format */
939                 if ( (ch & 0xc0) != 0xc0 )
940                     return(-1);
941                 /* then skip over remaining bytes for this char */
942                 while ( (ch <<= 1) & 0x80 )
943                     if ( (*utf++ & 0xc0) != 0x80 )
944                         return(-1);
945             }
946         }
947 
948     return(-1);
949 }
950 /**
951  * xmlUTF8Strsub:
952  * @utf:  a sequence of UTF-8 encoded bytes
953  * @start: relative pos of first char
954  * @len:   total number to copy
955  *
956  * Create a substring from a given UTF-8 string
957  * Note:  positions are given in units of UTF-8 chars
958  *
959  * Returns a pointer to a newly created string
960  * or NULL if any problem
961  */
962 
963 xmlChar *
xmlUTF8Strsub(const xmlChar * utf,int start,int len)964 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
965     int i;
966     int ch;
967 
968     if (utf == NULL) return(NULL);
969     if (start < 0) return(NULL);
970     if (len < 0) return(NULL);
971 
972     /*
973      * Skip over any leading chars
974      */
975     for (i = 0;i < start;i++) {
976         if ((ch=*utf++) == 0) return(NULL);
977         if ( ch & 0x80 ) {
978             /* if not simple ascii, verify proper format */
979             if ( (ch & 0xc0) != 0xc0 )
980                 return(NULL);
981             /* then skip over remaining bytes for this char */
982             while ( (ch <<= 1) & 0x80 )
983                 if ( (*utf++ & 0xc0) != 0x80 )
984                     return(NULL);
985         }
986     }
987 
988     return(xmlUTF8Strndup(utf, len));
989 }
990 
991 /**
992  * xmlEscapeFormatString:
993  * @msg:  a pointer to the string in which to escape '%' characters.
994  * Must be a heap-allocated buffer created by libxml2 that may be
995  * returned, or that may be freed and replaced.
996  *
997  * Replaces the string pointed to by 'msg' with an escaped string.
998  * Returns the same string with all '%' characters escaped.
999  */
1000 xmlChar *
xmlEscapeFormatString(xmlChar ** msg)1001 xmlEscapeFormatString(xmlChar **msg)
1002 {
1003     xmlChar *msgPtr = NULL;
1004     xmlChar *result = NULL;
1005     xmlChar *resultPtr = NULL;
1006     size_t count = 0;
1007     size_t msgLen = 0;
1008     size_t resultLen = 0;
1009 
1010     if (!msg || !*msg)
1011         return(NULL);
1012 
1013     for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1014         ++msgLen;
1015         if (*msgPtr == '%')
1016             ++count;
1017     }
1018 
1019     if (count == 0)
1020         return(*msg);
1021 
1022     if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1023         return(NULL);
1024     resultLen = msgLen + count + 1;
1025     result = (xmlChar *) xmlMallocAtomic(resultLen);
1026     if (result == NULL) {
1027         /* Clear *msg to prevent format string vulnerabilities in
1028            out-of-memory situations. */
1029         xmlFree(*msg);
1030         *msg = NULL;
1031         return(NULL);
1032     }
1033 
1034     for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1035         *resultPtr = *msgPtr;
1036         if (*msgPtr == '%')
1037             *(++resultPtr) = '%';
1038     }
1039     result[resultLen - 1] = '\0';
1040 
1041     xmlFree(*msg);
1042     *msg = result;
1043 
1044     return *msg;
1045 }
1046 
1047