• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * string.c : an XML string utilities module
3  *
4  * This module provides various utility functions for manipulating
5  * the xmlChar* type. All functions named xmlStr* have been moved here
6  * from the parser.c file (their original home).
7  *
8  * See Copyright for the status of this software.
9  *
10  * UTF8 string routines from:
11  * William Brack <wbrack@mmm.com.hk>
12  *
13  * daniel@veillard.com
14  */
15 
16 #define IN_LIBXML
17 #include "libxml.h"
18 
19 #include <stdlib.h>
20 #include <string.h>
21 #include <limits.h>
22 #include <libxml/xmlmemory.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/xmlstring.h>
25 
26 /************************************************************************
27  *                                                                      *
28  *                Commodity functions to handle xmlChars                *
29  *                                                                      *
30  ************************************************************************/
31 
32 /**
33  * xmlStrndup:
34  * @cur:  the input xmlChar *
35  * @len:  the len of @cur
36  *
37  * a strndup for array of xmlChar's
38  *
39  * Returns a new xmlChar * or NULL
40  */
41 xmlChar *
xmlStrndup(const xmlChar * cur,int len)42 xmlStrndup(const xmlChar *cur, int len) {
43     xmlChar *ret;
44 
45     if ((cur == NULL) || (len < 0)) return(NULL);
46     ret = (xmlChar *) xmlMallocAtomic(((size_t) len + 1) * sizeof(xmlChar));
47     if (ret == NULL) {
48         xmlErrMemory(NULL, NULL);
49         return(NULL);
50     }
51     memcpy(ret, cur, len * sizeof(xmlChar));
52     ret[len] = 0;
53     return(ret);
54 }
55 
56 /**
57  * xmlStrdup:
58  * @cur:  the input xmlChar *
59  *
60  * a strdup for array of xmlChar's. Since they are supposed to be
61  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
62  * a termination mark of '0'.
63  *
64  * Returns a new xmlChar * or NULL
65  */
66 xmlChar *
xmlStrdup(const xmlChar * cur)67 xmlStrdup(const xmlChar *cur) {
68     const xmlChar *p = cur;
69 
70     if (cur == NULL) return(NULL);
71     while (*p != 0) p++; /* non input consuming */
72     return(xmlStrndup(cur, p - cur));
73 }
74 
75 /**
76  * xmlCharStrndup:
77  * @cur:  the input char *
78  * @len:  the len of @cur
79  *
80  * a strndup for char's to xmlChar's
81  *
82  * Returns a new xmlChar * or NULL
83  */
84 
85 xmlChar *
xmlCharStrndup(const char * cur,int len)86 xmlCharStrndup(const char *cur, int len) {
87     int i;
88     xmlChar *ret;
89 
90     if ((cur == NULL) || (len < 0)) return(NULL);
91     ret = (xmlChar *) xmlMallocAtomic(((size_t) len + 1) * sizeof(xmlChar));
92     if (ret == NULL) {
93         xmlErrMemory(NULL, NULL);
94         return(NULL);
95     }
96     for (i = 0;i < len;i++) {
97         ret[i] = (xmlChar) cur[i];
98         if (ret[i] == 0) return(ret);
99     }
100     ret[len] = 0;
101     return(ret);
102 }
103 
104 /**
105  * xmlCharStrdup:
106  * @cur:  the input char *
107  *
108  * a strdup for char's to xmlChar's
109  *
110  * Returns a new xmlChar * or NULL
111  */
112 
113 xmlChar *
xmlCharStrdup(const char * cur)114 xmlCharStrdup(const char *cur) {
115     const char *p = cur;
116 
117     if (cur == NULL) return(NULL);
118     while (*p != '\0') p++; /* non input consuming */
119     return(xmlCharStrndup(cur, p - cur));
120 }
121 
122 /**
123  * xmlStrcmp:
124  * @str1:  the first xmlChar *
125  * @str2:  the second xmlChar *
126  *
127  * a strcmp for xmlChar's
128  *
129  * Returns the integer result of the comparison
130  */
131 
132 int
xmlStrcmp(const xmlChar * str1,const xmlChar * str2)133 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
134     if (str1 == str2) return(0);
135     if (str1 == NULL) return(-1);
136     if (str2 == NULL) return(1);
137 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
138     return(strcmp((const char *)str1, (const char *)str2));
139 #else
140     do {
141         int tmp = *str1++ - *str2;
142         if (tmp != 0) return(tmp);
143     } while (*str2++ != 0);
144     return 0;
145 #endif
146 }
147 
148 /**
149  * xmlStrEqual:
150  * @str1:  the first xmlChar *
151  * @str2:  the second xmlChar *
152  *
153  * Check if both strings are equal of have same content.
154  * Should be a bit more readable and faster than xmlStrcmp()
155  *
156  * Returns 1 if they are equal, 0 if they are different
157  */
158 
159 int
xmlStrEqual(const xmlChar * str1,const xmlChar * str2)160 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
161     if (str1 == str2) return(1);
162     if (str1 == NULL) return(0);
163     if (str2 == NULL) return(0);
164 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
165     return(strcmp((const char *)str1, (const char *)str2) == 0);
166 #else
167     do {
168         if (*str1++ != *str2) return(0);
169     } while (*str2++);
170     return(1);
171 #endif
172 }
173 
174 /**
175  * xmlStrQEqual:
176  * @pref:  the prefix of the QName
177  * @name:  the localname of the QName
178  * @str:  the second xmlChar *
179  *
180  * Check if a QName is Equal to a given string
181  *
182  * Returns 1 if they are equal, 0 if they are different
183  */
184 
185 int
xmlStrQEqual(const xmlChar * pref,const xmlChar * name,const xmlChar * str)186 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
187     if (pref == NULL) return(xmlStrEqual(name, str));
188     if (name == NULL) return(0);
189     if (str == NULL) return(0);
190 
191     do {
192         if (*pref++ != *str) return(0);
193     } while ((*str++) && (*pref));
194     if (*str++ != ':') return(0);
195     do {
196         if (*name++ != *str) return(0);
197     } while (*str++);
198     return(1);
199 }
200 
201 /**
202  * xmlStrncmp:
203  * @str1:  the first xmlChar *
204  * @str2:  the second xmlChar *
205  * @len:  the max comparison length
206  *
207  * a strncmp for xmlChar's
208  *
209  * Returns the integer result of the comparison
210  */
211 
212 int
xmlStrncmp(const xmlChar * str1,const xmlChar * str2,int len)213 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
214     if (len <= 0) return(0);
215     if (str1 == str2) return(0);
216     if (str1 == NULL) return(-1);
217     if (str2 == NULL) return(1);
218 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
219     return(strncmp((const char *)str1, (const char *)str2, len));
220 #else
221     do {
222         int tmp = *str1++ - *str2;
223         if (tmp != 0 || --len == 0) return(tmp);
224     } while (*str2++ != 0);
225     return 0;
226 #endif
227 }
228 
229 static const xmlChar casemap[256] = {
230     0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
231     0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
232     0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
233     0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
234     0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
235     0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
236     0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
237     0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
238     0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241     0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
242     0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
243     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
244     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
245     0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
246     0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
247     0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
248     0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
249     0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
250     0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
251     0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
252     0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
253     0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
254     0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
255     0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
256     0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
257     0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
258     0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
259     0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
260     0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
261     0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
262 };
263 
264 /**
265  * xmlStrcasecmp:
266  * @str1:  the first xmlChar *
267  * @str2:  the second xmlChar *
268  *
269  * a strcasecmp for xmlChar's
270  *
271  * Returns the integer result of the comparison
272  */
273 
274 int
xmlStrcasecmp(const xmlChar * str1,const xmlChar * str2)275 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
276     register int tmp;
277 
278     if (str1 == str2) return(0);
279     if (str1 == NULL) return(-1);
280     if (str2 == NULL) return(1);
281     do {
282         tmp = casemap[*str1++] - casemap[*str2];
283         if (tmp != 0) return(tmp);
284     } while (*str2++ != 0);
285     return 0;
286 }
287 
288 /**
289  * xmlStrncasecmp:
290  * @str1:  the first xmlChar *
291  * @str2:  the second xmlChar *
292  * @len:  the max comparison length
293  *
294  * a strncasecmp for xmlChar's
295  *
296  * Returns the integer result of the comparison
297  */
298 
299 int
xmlStrncasecmp(const xmlChar * str1,const xmlChar * str2,int len)300 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
301     register int tmp;
302 
303     if (len <= 0) return(0);
304     if (str1 == str2) return(0);
305     if (str1 == NULL) return(-1);
306     if (str2 == NULL) return(1);
307     do {
308         tmp = casemap[*str1++] - casemap[*str2];
309         if (tmp != 0 || --len == 0) return(tmp);
310     } while (*str2++ != 0);
311     return 0;
312 }
313 
314 /**
315  * xmlStrchr:
316  * @str:  the xmlChar * array
317  * @val:  the xmlChar to search
318  *
319  * a strchr for xmlChar's
320  *
321  * Returns the xmlChar * for the first occurrence or NULL.
322  */
323 
324 const xmlChar *
xmlStrchr(const xmlChar * str,xmlChar val)325 xmlStrchr(const xmlChar *str, xmlChar val) {
326     if (str == NULL) return(NULL);
327     while (*str != 0) { /* non input consuming */
328         if (*str == val) return((xmlChar *) str);
329         str++;
330     }
331     return(NULL);
332 }
333 
334 /**
335  * xmlStrstr:
336  * @str:  the xmlChar * array (haystack)
337  * @val:  the xmlChar to search (needle)
338  *
339  * a strstr for xmlChar's
340  *
341  * Returns the xmlChar * for the first occurrence or NULL.
342  */
343 
344 const xmlChar *
xmlStrstr(const xmlChar * str,const xmlChar * val)345 xmlStrstr(const xmlChar *str, const xmlChar *val) {
346     int n;
347 
348     if (str == NULL) return(NULL);
349     if (val == NULL) return(NULL);
350     n = xmlStrlen(val);
351 
352     if (n == 0) return(str);
353     while (*str != 0) { /* non input consuming */
354         if (*str == *val) {
355             if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
356         }
357         str++;
358     }
359     return(NULL);
360 }
361 
362 /**
363  * xmlStrcasestr:
364  * @str:  the xmlChar * array (haystack)
365  * @val:  the xmlChar to search (needle)
366  *
367  * a case-ignoring strstr for xmlChar's
368  *
369  * Returns the xmlChar * for the first occurrence or NULL.
370  */
371 
372 const xmlChar *
xmlStrcasestr(const xmlChar * str,const xmlChar * val)373 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
374     int n;
375 
376     if (str == NULL) return(NULL);
377     if (val == NULL) return(NULL);
378     n = xmlStrlen(val);
379 
380     if (n == 0) return(str);
381     while (*str != 0) { /* non input consuming */
382         if (casemap[*str] == casemap[*val])
383             if (!xmlStrncasecmp(str, val, n)) return(str);
384         str++;
385     }
386     return(NULL);
387 }
388 
389 /**
390  * xmlStrsub:
391  * @str:  the xmlChar * array (haystack)
392  * @start:  the index of the first char (zero based)
393  * @len:  the length of the substring
394  *
395  * Extract a substring of a given string
396  *
397  * Returns the xmlChar * for the first occurrence or NULL.
398  */
399 
400 xmlChar *
xmlStrsub(const xmlChar * str,int start,int len)401 xmlStrsub(const xmlChar *str, int start, int len) {
402     int i;
403 
404     if (str == NULL) return(NULL);
405     if (start < 0) return(NULL);
406     if (len < 0) return(NULL);
407 
408     for (i = 0;i < start;i++) {
409         if (*str == 0) return(NULL);
410         str++;
411     }
412     if (*str == 0) return(NULL);
413     return(xmlStrndup(str, len));
414 }
415 
416 /**
417  * xmlStrlen:
418  * @str:  the xmlChar * array
419  *
420  * length of a xmlChar's string
421  *
422  * Returns the number of xmlChar contained in the ARRAY.
423  */
424 
425 int
xmlStrlen(const xmlChar * str)426 xmlStrlen(const xmlChar *str) {
427     size_t len = 0;
428 
429     if (str == NULL) return(0);
430     while (*str != 0) { /* non input consuming */
431         str++;
432         len++;
433     }
434     return(len > INT_MAX ? 0 : len);
435 }
436 
437 /**
438  * xmlStrncat:
439  * @cur:  the original xmlChar * array
440  * @add:  the xmlChar * array added
441  * @len:  the length of @add
442  *
443  * a strncat for array of xmlChar's, it will extend @cur with the len
444  * first bytes of @add. Note that if @len < 0 then this is an API error
445  * and NULL will be returned.
446  *
447  * Returns a new xmlChar *, the original @cur is reallocated and should
448  * not be freed.
449  */
450 
451 xmlChar *
xmlStrncat(xmlChar * cur,const xmlChar * add,int len)452 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
453     int size;
454     xmlChar *ret;
455 
456     if ((add == NULL) || (len == 0))
457         return(cur);
458     if (len < 0)
459 	return(NULL);
460     if (cur == NULL)
461         return(xmlStrndup(add, len));
462 
463     size = xmlStrlen(cur);
464     if ((size < 0) || (size > INT_MAX - len))
465         return(NULL);
466     ret = (xmlChar *) xmlRealloc(cur, ((size_t) size + len + 1) * sizeof(xmlChar));
467     if (ret == NULL) {
468         xmlErrMemory(NULL, NULL);
469         return(cur);
470     }
471     memcpy(&ret[size], add, len * sizeof(xmlChar));
472     ret[size + len] = 0;
473     return(ret);
474 }
475 
476 /**
477  * xmlStrncatNew:
478  * @str1:  first xmlChar string
479  * @str2:  second xmlChar string
480  * @len:  the len of @str2 or < 0
481  *
482  * same as xmlStrncat, but creates a new string.  The original
483  * two strings are not freed. If @len is < 0 then the length
484  * will be calculated automatically.
485  *
486  * Returns a new xmlChar * or NULL
487  */
488 xmlChar *
xmlStrncatNew(const xmlChar * str1,const xmlChar * str2,int len)489 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
490     int size;
491     xmlChar *ret;
492 
493     if (len < 0) {
494         len = xmlStrlen(str2);
495         if (len < 0)
496             return(NULL);
497     }
498     if ((str2 == NULL) || (len == 0))
499         return(xmlStrdup(str1));
500     if (str1 == NULL)
501         return(xmlStrndup(str2, len));
502 
503     size = xmlStrlen(str1);
504     if ((size < 0) || (size > INT_MAX - len))
505         return(NULL);
506     ret = (xmlChar *) xmlMalloc(((size_t) size + len + 1) * sizeof(xmlChar));
507     if (ret == NULL) {
508         xmlErrMemory(NULL, NULL);
509         return(xmlStrndup(str1, size));
510     }
511     memcpy(ret, str1, size * sizeof(xmlChar));
512     memcpy(&ret[size], str2, len * sizeof(xmlChar));
513     ret[size + len] = 0;
514     return(ret);
515 }
516 
517 /**
518  * xmlStrcat:
519  * @cur:  the original xmlChar * array
520  * @add:  the xmlChar * array added
521  *
522  * a strcat for array of xmlChar's. Since they are supposed to be
523  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
524  * a termination mark of '0'.
525  *
526  * Returns a new xmlChar * containing the concatenated string. The original
527  * @cur is reallocated and should not be freed.
528  */
529 xmlChar *
xmlStrcat(xmlChar * cur,const xmlChar * add)530 xmlStrcat(xmlChar *cur, const xmlChar *add) {
531     const xmlChar *p = add;
532 
533     if (add == NULL) return(cur);
534     if (cur == NULL)
535         return(xmlStrdup(add));
536 
537     while (*p != 0) p++; /* non input consuming */
538     return(xmlStrncat(cur, add, p - add));
539 }
540 
541 /**
542  * xmlStrPrintf:
543  * @buf:   the result buffer.
544  * @len:   the result buffer length.
545  * @msg:   the message with printf formatting.
546  * @...:   extra parameters for the message.
547  *
548  * Formats @msg and places result into @buf.
549  *
550  * Returns the number of characters written to @buf or -1 if an error occurs.
551  */
552 int XMLCDECL
xmlStrPrintf(xmlChar * buf,int len,const char * msg,...)553 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
554     va_list args;
555     int ret;
556 
557     if((buf == NULL) || (msg == NULL)) {
558         return(-1);
559     }
560 
561     va_start(args, msg);
562     ret = vsnprintf((char *) buf, len, (const char *) msg, args);
563     va_end(args);
564     buf[len - 1] = 0; /* be safe ! */
565 
566     return(ret);
567 }
568 
569 /**
570  * xmlStrVPrintf:
571  * @buf:   the result buffer.
572  * @len:   the result buffer length.
573  * @msg:   the message with printf formatting.
574  * @ap:    extra parameters for the message.
575  *
576  * Formats @msg and places result into @buf.
577  *
578  * Returns the number of characters written to @buf or -1 if an error occurs.
579  */
580 int
xmlStrVPrintf(xmlChar * buf,int len,const char * msg,va_list ap)581 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
582     int ret;
583 
584     if((buf == NULL) || (msg == NULL)) {
585         return(-1);
586     }
587 
588     ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
589     buf[len - 1] = 0; /* be safe ! */
590 
591     return(ret);
592 }
593 
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From RFC 2044 (now superseded by RFC 3629): encoding of the Unicode  *
 * values on UTF-8:                                                     *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-0010 FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * Values above 0xFFFF use the 4-byte form, which xmlGetUTF8Char() and  *
 * xmlCheckUTF8() below accept.                                         *
 *                                                                      *
 ************************************************************************/
608 
609 
610 /**
611  * xmlUTF8Size:
612  * @utf: pointer to the UTF8 character
613  *
614  * calculates the internal size of a UTF8 character
615  *
616  * returns the numbers of bytes in the character, -1 on format error
617  */
618 int
xmlUTF8Size(const xmlChar * utf)619 xmlUTF8Size(const xmlChar *utf) {
620     xmlChar mask;
621     int len;
622 
623     if (utf == NULL)
624         return -1;
625     if (*utf < 0x80)
626         return 1;
627     /* check valid UTF8 character */
628     if (!(*utf & 0x40))
629         return -1;
630     /* determine number of bytes in char */
631     len = 2;
632     for (mask=0x20; mask != 0; mask>>=1) {
633         if (!(*utf & mask))
634             return len;
635         len++;
636     }
637     return -1;
638 }
639 
640 /**
641  * xmlUTF8Charcmp:
642  * @utf1: pointer to first UTF8 char
643  * @utf2: pointer to second UTF8 char
644  *
645  * compares the two UCS4 values
646  *
647  * returns result of the compare as with xmlStrncmp
648  */
649 int
xmlUTF8Charcmp(const xmlChar * utf1,const xmlChar * utf2)650 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
651 
652     if (utf1 == NULL ) {
653         if (utf2 == NULL)
654             return 0;
655         return -1;
656     }
657     return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
658 }
659 
660 /**
661  * xmlUTF8Strlen:
662  * @utf:  a sequence of UTF-8 encoded bytes
663  *
664  * compute the length of an UTF8 string, it doesn't do a full UTF8
665  * checking of the content of the string.
666  *
667  * Returns the number of characters in the string or -1 in case of error
668  */
669 int
xmlUTF8Strlen(const xmlChar * utf)670 xmlUTF8Strlen(const xmlChar *utf) {
671     size_t ret = 0;
672 
673     if (utf == NULL)
674         return(-1);
675 
676     while (*utf != 0) {
677         if (utf[0] & 0x80) {
678             if ((utf[1] & 0xc0) != 0x80)
679                 return(-1);
680             if ((utf[0] & 0xe0) == 0xe0) {
681                 if ((utf[2] & 0xc0) != 0x80)
682                     return(-1);
683                 if ((utf[0] & 0xf0) == 0xf0) {
684                     if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
685                         return(-1);
686                     utf += 4;
687                 } else {
688                     utf += 3;
689                 }
690             } else {
691                 utf += 2;
692             }
693         } else {
694             utf++;
695         }
696         ret++;
697     }
698     return(ret > INT_MAX ? 0 : ret);
699 }
700 
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. A continuation byte
 * (10xxxxxx) in first position is rejected, as it is by xmlCheckUTF8().
 * Over-long encodings are not rejected.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* 10xxxxxx is a trailing byte and cannot start a character */
        if ((c & 0xc0) == 0x80)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* lead byte must be 11110xxx: nothing longer than 4 bytes */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return((int) c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
770 
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid, 0 if it is NULL or malformed.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *p;

    if (utf == NULL)
        return(0);

    /*
     * Each character is 1, 2, 3 or 4 bytes long.  The valid forms
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    p = utf;
    while (*p != 0) {           /* string is 0-terminated */
        unsigned char lead = *p;
        int trail;
        int k;

        if ((lead & 0x80) == 0x00)      /* 1-byte code, starts with 0 */
            trail = 0;
        else if ((lead & 0xe0) == 0xc0) /* 2-byte code, starts with 110 */
            trail = 1;
        else if ((lead & 0xf0) == 0xe0) /* 3-byte code, starts with 1110 */
            trail = 2;
        else if ((lead & 0xf8) == 0xf0) /* 4-byte code, starts with 11110 */
            trail = 3;
        else                            /* unknown encoding */
            return(0);

        /* every trailing byte must look like 10xxxxxx */
        for (k = 1; k <= trail; k++) {
            if ((p[k] & 0xc0) != 0x80)
                return(0);
        }
        p += trail + 1;
    }
    return(1);
}
825 
826 /**
827  * xmlUTF8Strsize:
828  * @utf:  a sequence of UTF-8 encoded bytes
829  * @len:  the number of characters in the array
830  *
831  * storage size of an UTF8 string
832  * the behaviour is not guaranteed if the input string is not UTF-8
833  *
834  * Returns the storage size of
835  * the first 'len' characters of ARRAY
836  */
837 
838 int
xmlUTF8Strsize(const xmlChar * utf,int len)839 xmlUTF8Strsize(const xmlChar *utf, int len) {
840     const xmlChar *ptr=utf;
841     int ch;
842     size_t ret;
843 
844     if (utf == NULL)
845         return(0);
846 
847     if (len <= 0)
848         return(0);
849 
850     while ( len-- > 0) {
851         if ( !*ptr )
852             break;
853         if ( (ch = *ptr++) & 0x80)
854             while ((ch<<=1) & 0x80 ) {
855 		if (*ptr == 0) break;
856                 ptr++;
857 	    }
858     }
859     ret = ptr - utf;
860     return (ret > INT_MAX ? 0 : ret);
861 }
862 
863 
864 /**
865  * xmlUTF8Strndup:
866  * @utf:  the input UTF8 *
867  * @len:  the len of @utf (in chars)
868  *
869  * a strndup for array of UTF8's
870  *
871  * Returns a new UTF8 * or NULL
872  */
873 xmlChar *
xmlUTF8Strndup(const xmlChar * utf,int len)874 xmlUTF8Strndup(const xmlChar *utf, int len) {
875     xmlChar *ret;
876     int i;
877 
878     if ((utf == NULL) || (len < 0)) return(NULL);
879     i = xmlUTF8Strsize(utf, len);
880     ret = (xmlChar *) xmlMallocAtomic(((size_t) i + 1) * sizeof(xmlChar));
881     if (ret == NULL) {
882         return(NULL);
883     }
884     memcpy(ret, utf, i * sizeof(xmlChar));
885     ret[i] = 0;
886     return(ret);
887 }
888 
889 /**
890  * xmlUTF8Strpos:
891  * @utf:  the input UTF8 *
892  * @pos:  the position of the desired UTF8 char (in chars)
893  *
894  * a function to provide the equivalent of fetching a
895  * character from a string array
896  *
897  * Returns a pointer to the UTF8 character or NULL
898  */
899 const xmlChar *
xmlUTF8Strpos(const xmlChar * utf,int pos)900 xmlUTF8Strpos(const xmlChar *utf, int pos) {
901     int ch;
902 
903     if (utf == NULL) return(NULL);
904     if (pos < 0)
905         return(NULL);
906     while (pos--) {
907         if ((ch=*utf++) == 0) return(NULL);
908         if ( ch & 0x80 ) {
909             /* if not simple ascii, verify proper format */
910             if ( (ch & 0xc0) != 0xc0 )
911                 return(NULL);
912             /* then skip over remaining bytes for this char */
913             while ( (ch <<= 1) & 0x80 )
914                 if ( (*utf++ & 0xc0) != 0x80 )
915                     return(NULL);
916         }
917     }
918     return((xmlChar *)utf);
919 }
920 
921 /**
922  * xmlUTF8Strloc:
923  * @utf:  the input UTF8 *
924  * @utfchar:  the UTF8 character to be found
925  *
926  * a function to provide the relative location of a UTF8 char
927  *
928  * Returns the relative character position of the desired char
929  * or -1 if not found
930  */
931 int
xmlUTF8Strloc(const xmlChar * utf,const xmlChar * utfchar)932 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
933     size_t i;
934     int size;
935     int ch;
936 
937     if (utf==NULL || utfchar==NULL) return -1;
938     size = xmlUTF8Strsize(utfchar, 1);
939         for(i=0; (ch=*utf) != 0; i++) {
940             if (xmlStrncmp(utf, utfchar, size)==0)
941                 return(i > INT_MAX ? 0 : i);
942             utf++;
943             if ( ch & 0x80 ) {
944                 /* if not simple ascii, verify proper format */
945                 if ( (ch & 0xc0) != 0xc0 )
946                     return(-1);
947                 /* then skip over remaining bytes for this char */
948                 while ( (ch <<= 1) & 0x80 )
949                     if ( (*utf++ & 0xc0) != 0x80 )
950                         return(-1);
951             }
952         }
953 
954     return(-1);
955 }
956 /**
957  * xmlUTF8Strsub:
958  * @utf:  a sequence of UTF-8 encoded bytes
959  * @start: relative pos of first char
960  * @len:   total number to copy
961  *
962  * Create a substring from a given UTF-8 string
963  * Note:  positions are given in units of UTF-8 chars
964  *
965  * Returns a pointer to a newly created string
966  * or NULL if any problem
967  */
968 
969 xmlChar *
xmlUTF8Strsub(const xmlChar * utf,int start,int len)970 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
971     int i;
972     int ch;
973 
974     if (utf == NULL) return(NULL);
975     if (start < 0) return(NULL);
976     if (len < 0) return(NULL);
977 
978     /*
979      * Skip over any leading chars
980      */
981     for (i = 0;i < start;i++) {
982         if ((ch=*utf++) == 0) return(NULL);
983         if ( ch & 0x80 ) {
984             /* if not simple ascii, verify proper format */
985             if ( (ch & 0xc0) != 0xc0 )
986                 return(NULL);
987             /* then skip over remaining bytes for this char */
988             while ( (ch <<= 1) & 0x80 )
989                 if ( (*utf++ & 0xc0) != 0x80 )
990                     return(NULL);
991         }
992     }
993 
994     return(xmlUTF8Strndup(utf, len));
995 }
996 
997 /**
998  * xmlEscapeFormatString:
999  * @msg:  a pointer to the string in which to escape '%' characters.
1000  * Must be a heap-allocated buffer created by libxml2 that may be
1001  * returned, or that may be freed and replaced.
1002  *
1003  * Replaces the string pointed to by 'msg' with an escaped string.
1004  * Returns the same string with all '%' characters escaped.
1005  */
1006 xmlChar *
xmlEscapeFormatString(xmlChar ** msg)1007 xmlEscapeFormatString(xmlChar **msg)
1008 {
1009     xmlChar *msgPtr = NULL;
1010     xmlChar *result = NULL;
1011     xmlChar *resultPtr = NULL;
1012     size_t count = 0;
1013     size_t msgLen = 0;
1014     size_t resultLen = 0;
1015 
1016     if (!msg || !*msg)
1017         return(NULL);
1018 
1019     for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1020         ++msgLen;
1021         if (*msgPtr == '%')
1022             ++count;
1023     }
1024 
1025     if (count == 0)
1026         return(*msg);
1027 
1028     if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1029         return(NULL);
1030     resultLen = msgLen + count + 1;
1031     result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1032     if (result == NULL) {
1033         /* Clear *msg to prevent format string vulnerabilities in
1034            out-of-memory situations. */
1035         xmlFree(*msg);
1036         *msg = NULL;
1037         xmlErrMemory(NULL, NULL);
1038         return(NULL);
1039     }
1040 
1041     for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1042         *resultPtr = *msgPtr;
1043         if (*msgPtr == '%')
1044             *(++resultPtr) = '%';
1045     }
1046     result[resultLen - 1] = '\0';
1047 
1048     xmlFree(*msg);
1049     *msg = result;
1050 
1051     return *msg;
1052 }
1053 
1054 #define bottom_xmlstring
1055 #include "elfgcchack.h"
1056