• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * string.c : an XML string utilities module
3  *
4  * This module provides various utility functions for manipulating
5  * the xmlChar* type. All functions named xmlStr* have been moved here
6  * from the parser.c file (their original home).
7  *
8  * See Copyright for the status of this software.
9  *
10  * UTF8 string routines from:
11  * William Brack <wbrack@mmm.com.hk>
12  *
13  * daniel@veillard.com
14  */
15 
16 #define IN_LIBXML
17 #include "libxml.h"
18 
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/xmlmemory.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlstring.h>
24 
25 /************************************************************************
26  *                                                                      *
27  *                Commodity functions to handle xmlChars                *
28  *                                                                      *
29  ************************************************************************/
30 
31 /**
32  * xmlStrndup:
33  * @cur:  the input xmlChar *
34  * @len:  the len of @cur
35  *
36  * a strndup for array of xmlChar's
37  *
38  * Returns a new xmlChar * or NULL
39  */
40 xmlChar *
xmlStrndup(const xmlChar * cur,int len)41 xmlStrndup(const xmlChar *cur, int len) {
42     xmlChar *ret;
43 
44     if ((cur == NULL) || (len < 0)) return(NULL);
45     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46     if (ret == NULL) {
47         xmlErrMemory(NULL, NULL);
48         return(NULL);
49     }
50     memcpy(ret, cur, len * sizeof(xmlChar));
51     ret[len] = 0;
52     return(ret);
53 }
54 
55 /**
56  * xmlStrdup:
57  * @cur:  the input xmlChar *
58  *
59  * a strdup for array of xmlChar's. Since they are supposed to be
60  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61  * a termination mark of '0'.
62  *
63  * Returns a new xmlChar * or NULL
64  */
65 xmlChar *
xmlStrdup(const xmlChar * cur)66 xmlStrdup(const xmlChar *cur) {
67     const xmlChar *p = cur;
68 
69     if (cur == NULL) return(NULL);
70     while (*p != 0) p++; /* non input consuming */
71     return(xmlStrndup(cur, p - cur));
72 }
73 
74 /**
75  * xmlCharStrndup:
76  * @cur:  the input char *
77  * @len:  the len of @cur
78  *
79  * a strndup for char's to xmlChar's
80  *
81  * Returns a new xmlChar * or NULL
82  */
83 
84 xmlChar *
xmlCharStrndup(const char * cur,int len)85 xmlCharStrndup(const char *cur, int len) {
86     int i;
87     xmlChar *ret;
88 
89     if ((cur == NULL) || (len < 0)) return(NULL);
90     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91     if (ret == NULL) {
92         xmlErrMemory(NULL, NULL);
93         return(NULL);
94     }
95     for (i = 0;i < len;i++) {
96         ret[i] = (xmlChar) cur[i];
97         if (ret[i] == 0) return(ret);
98     }
99     ret[len] = 0;
100     return(ret);
101 }
102 
103 /**
104  * xmlCharStrdup:
105  * @cur:  the input char *
106  *
107  * a strdup for char's to xmlChar's
108  *
109  * Returns a new xmlChar * or NULL
110  */
111 
112 xmlChar *
xmlCharStrdup(const char * cur)113 xmlCharStrdup(const char *cur) {
114     const char *p = cur;
115 
116     if (cur == NULL) return(NULL);
117     while (*p != '\0') p++; /* non input consuming */
118     return(xmlCharStrndup(cur, p - cur));
119 }
120 
121 /**
122  * xmlStrcmp:
123  * @str1:  the first xmlChar *
124  * @str2:  the second xmlChar *
125  *
126  * a strcmp for xmlChar's
127  *
128  * Returns the integer result of the comparison
129  */
130 
131 int
xmlStrcmp(const xmlChar * str1,const xmlChar * str2)132 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133     if (str1 == str2) return(0);
134     if (str1 == NULL) return(-1);
135     if (str2 == NULL) return(1);
136 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
137     return(strcmp((const char *)str1, (const char *)str2));
138 #else
139     do {
140         int tmp = *str1++ - *str2;
141         if (tmp != 0) return(tmp);
142     } while (*str2++ != 0);
143     return 0;
144 #endif
145 }
146 
147 /**
148  * xmlStrEqual:
149  * @str1:  the first xmlChar *
150  * @str2:  the second xmlChar *
151  *
152  * Check if both strings are equal of have same content.
153  * Should be a bit more readable and faster than xmlStrcmp()
154  *
155  * Returns 1 if they are equal, 0 if they are different
156  */
157 
158 int
xmlStrEqual(const xmlChar * str1,const xmlChar * str2)159 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
160     if (str1 == str2) return(1);
161     if (str1 == NULL) return(0);
162     if (str2 == NULL) return(0);
163 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
164     return(strcmp((const char *)str1, (const char *)str2) == 0);
165 #else
166     do {
167         if (*str1++ != *str2) return(0);
168     } while (*str2++);
169     return(1);
170 #endif
171 }
172 
173 /**
174  * xmlStrQEqual:
175  * @pref:  the prefix of the QName
176  * @name:  the localname of the QName
177  * @str:  the second xmlChar *
178  *
179  * Check if a QName is Equal to a given string
180  *
181  * Returns 1 if they are equal, 0 if they are different
182  */
183 
184 int
xmlStrQEqual(const xmlChar * pref,const xmlChar * name,const xmlChar * str)185 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
186     if (pref == NULL) return(xmlStrEqual(name, str));
187     if (name == NULL) return(0);
188     if (str == NULL) return(0);
189 
190     do {
191         if (*pref++ != *str) return(0);
192     } while ((*str++) && (*pref));
193     if (*str++ != ':') return(0);
194     do {
195         if (*name++ != *str) return(0);
196     } while (*str++);
197     return(1);
198 }
199 
200 /**
201  * xmlStrncmp:
202  * @str1:  the first xmlChar *
203  * @str2:  the second xmlChar *
204  * @len:  the max comparison length
205  *
206  * a strncmp for xmlChar's
207  *
208  * Returns the integer result of the comparison
209  */
210 
211 int
xmlStrncmp(const xmlChar * str1,const xmlChar * str2,int len)212 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
213     if (len <= 0) return(0);
214     if (str1 == str2) return(0);
215     if (str1 == NULL) return(-1);
216     if (str2 == NULL) return(1);
217 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
218     return(strncmp((const char *)str1, (const char *)str2, len));
219 #else
220     do {
221         int tmp = *str1++ - *str2;
222         if (tmp != 0 || --len == 0) return(tmp);
223     } while (*str2++ != 0);
224     return 0;
225 #endif
226 }
227 
228 static const xmlChar casemap[256] = {
229     0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
230     0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
231     0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
232     0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
233     0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
234     0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
235     0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
236     0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
237     0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
238     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
239     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
240     0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
241     0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
242     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
243     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
244     0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
245     0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
246     0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
247     0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
248     0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
249     0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
250     0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
251     0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
252     0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
253     0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
254     0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
255     0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
256     0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
257     0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
258     0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
259     0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
260     0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
261 };
262 
263 /**
264  * xmlStrcasecmp:
265  * @str1:  the first xmlChar *
266  * @str2:  the second xmlChar *
267  *
268  * a strcasecmp for xmlChar's
269  *
270  * Returns the integer result of the comparison
271  */
272 
273 int
xmlStrcasecmp(const xmlChar * str1,const xmlChar * str2)274 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
275     register int tmp;
276 
277     if (str1 == str2) return(0);
278     if (str1 == NULL) return(-1);
279     if (str2 == NULL) return(1);
280     do {
281         tmp = casemap[*str1++] - casemap[*str2];
282         if (tmp != 0) return(tmp);
283     } while (*str2++ != 0);
284     return 0;
285 }
286 
287 /**
288  * xmlStrncasecmp:
289  * @str1:  the first xmlChar *
290  * @str2:  the second xmlChar *
291  * @len:  the max comparison length
292  *
293  * a strncasecmp for xmlChar's
294  *
295  * Returns the integer result of the comparison
296  */
297 
298 int
xmlStrncasecmp(const xmlChar * str1,const xmlChar * str2,int len)299 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
300     register int tmp;
301 
302     if (len <= 0) return(0);
303     if (str1 == str2) return(0);
304     if (str1 == NULL) return(-1);
305     if (str2 == NULL) return(1);
306     do {
307         tmp = casemap[*str1++] - casemap[*str2];
308         if (tmp != 0 || --len == 0) return(tmp);
309     } while (*str2++ != 0);
310     return 0;
311 }
312 
313 /**
314  * xmlStrchr:
315  * @str:  the xmlChar * array
316  * @val:  the xmlChar to search
317  *
318  * a strchr for xmlChar's
319  *
320  * Returns the xmlChar * for the first occurrence or NULL.
321  */
322 
323 const xmlChar *
xmlStrchr(const xmlChar * str,xmlChar val)324 xmlStrchr(const xmlChar *str, xmlChar val) {
325     if (str == NULL) return(NULL);
326     while (*str != 0) { /* non input consuming */
327         if (*str == val) return((xmlChar *) str);
328         str++;
329     }
330     return(NULL);
331 }
332 
333 /**
334  * xmlStrstr:
335  * @str:  the xmlChar * array (haystack)
336  * @val:  the xmlChar to search (needle)
337  *
338  * a strstr for xmlChar's
339  *
340  * Returns the xmlChar * for the first occurrence or NULL.
341  */
342 
343 const xmlChar *
xmlStrstr(const xmlChar * str,const xmlChar * val)344 xmlStrstr(const xmlChar *str, const xmlChar *val) {
345     int n;
346 
347     if (str == NULL) return(NULL);
348     if (val == NULL) return(NULL);
349     n = xmlStrlen(val);
350 
351     if (n == 0) return(str);
352     while (*str != 0) { /* non input consuming */
353         if (*str == *val) {
354             if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
355         }
356         str++;
357     }
358     return(NULL);
359 }
360 
361 /**
362  * xmlStrcasestr:
363  * @str:  the xmlChar * array (haystack)
364  * @val:  the xmlChar to search (needle)
365  *
366  * a case-ignoring strstr for xmlChar's
367  *
368  * Returns the xmlChar * for the first occurrence or NULL.
369  */
370 
371 const xmlChar *
xmlStrcasestr(const xmlChar * str,const xmlChar * val)372 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
373     int n;
374 
375     if (str == NULL) return(NULL);
376     if (val == NULL) return(NULL);
377     n = xmlStrlen(val);
378 
379     if (n == 0) return(str);
380     while (*str != 0) { /* non input consuming */
381         if (casemap[*str] == casemap[*val])
382             if (!xmlStrncasecmp(str, val, n)) return(str);
383         str++;
384     }
385     return(NULL);
386 }
387 
388 /**
389  * xmlStrsub:
390  * @str:  the xmlChar * array (haystack)
391  * @start:  the index of the first char (zero based)
392  * @len:  the length of the substring
393  *
394  * Extract a substring of a given string
395  *
396  * Returns the xmlChar * for the first occurrence or NULL.
397  */
398 
399 xmlChar *
xmlStrsub(const xmlChar * str,int start,int len)400 xmlStrsub(const xmlChar *str, int start, int len) {
401     int i;
402 
403     if (str == NULL) return(NULL);
404     if (start < 0) return(NULL);
405     if (len < 0) return(NULL);
406 
407     for (i = 0;i < start;i++) {
408         if (*str == 0) return(NULL);
409         str++;
410     }
411     if (*str == 0) return(NULL);
412     return(xmlStrndup(str, len));
413 }
414 
415 /**
416  * xmlStrlen:
417  * @str:  the xmlChar * array
418  *
419  * length of a xmlChar's string
420  *
421  * Returns the number of xmlChar contained in the ARRAY.
422  */
423 
424 int
xmlStrlen(const xmlChar * str)425 xmlStrlen(const xmlChar *str) {
426     int len = 0;
427 
428     if (str == NULL) return(0);
429     while (*str != 0) { /* non input consuming */
430         str++;
431         len++;
432     }
433     return(len);
434 }
435 
436 /**
437  * xmlStrncat:
438  * @cur:  the original xmlChar * array
439  * @add:  the xmlChar * array added
440  * @len:  the length of @add
441  *
442  * a strncat for array of xmlChar's, it will extend @cur with the len
443  * first bytes of @add. Note that if @len < 0 then this is an API error
444  * and NULL will be returned.
445  *
446  * Returns a new xmlChar *, the original @cur is reallocated and should
447  * not be freed.
448  */
449 
450 xmlChar *
xmlStrncat(xmlChar * cur,const xmlChar * add,int len)451 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
452     int size;
453     xmlChar *ret;
454 
455     if ((add == NULL) || (len == 0))
456         return(cur);
457     if (len < 0)
458 	return(NULL);
459     if (cur == NULL)
460         return(xmlStrndup(add, len));
461 
462     size = xmlStrlen(cur);
463     if (size < 0)
464         return(NULL);
465     ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
466     if (ret == NULL) {
467         xmlErrMemory(NULL, NULL);
468         return(cur);
469     }
470     memcpy(&ret[size], add, len * sizeof(xmlChar));
471     ret[size + len] = 0;
472     return(ret);
473 }
474 
475 /**
476  * xmlStrncatNew:
477  * @str1:  first xmlChar string
478  * @str2:  second xmlChar string
479  * @len:  the len of @str2 or < 0
480  *
481  * same as xmlStrncat, but creates a new string.  The original
482  * two strings are not freed. If @len is < 0 then the length
483  * will be calculated automatically.
484  *
485  * Returns a new xmlChar * or NULL
486  */
487 xmlChar *
xmlStrncatNew(const xmlChar * str1,const xmlChar * str2,int len)488 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
489     int size;
490     xmlChar *ret;
491 
492     if (len < 0) {
493         len = xmlStrlen(str2);
494         if (len < 0)
495             return(NULL);
496     }
497     if ((str2 == NULL) || (len == 0))
498         return(xmlStrdup(str1));
499     if (str1 == NULL)
500         return(xmlStrndup(str2, len));
501 
502     size = xmlStrlen(str1);
503     if (size < 0)
504         return(NULL);
505     ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
506     if (ret == NULL) {
507         xmlErrMemory(NULL, NULL);
508         return(xmlStrndup(str1, size));
509     }
510     memcpy(ret, str1, size * sizeof(xmlChar));
511     memcpy(&ret[size], str2, len * sizeof(xmlChar));
512     ret[size + len] = 0;
513     return(ret);
514 }
515 
516 /**
517  * xmlStrcat:
518  * @cur:  the original xmlChar * array
519  * @add:  the xmlChar * array added
520  *
521  * a strcat for array of xmlChar's. Since they are supposed to be
522  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
523  * a termination mark of '0'.
524  *
525  * Returns a new xmlChar * containing the concatenated string. The original
526  * @cur is reallocated and should not be freed.
527  */
528 xmlChar *
xmlStrcat(xmlChar * cur,const xmlChar * add)529 xmlStrcat(xmlChar *cur, const xmlChar *add) {
530     const xmlChar *p = add;
531 
532     if (add == NULL) return(cur);
533     if (cur == NULL)
534         return(xmlStrdup(add));
535 
536     while (*p != 0) p++; /* non input consuming */
537     return(xmlStrncat(cur, add, p - add));
538 }
539 
540 /**
541  * xmlStrPrintf:
542  * @buf:   the result buffer.
543  * @len:   the result buffer length.
544  * @msg:   the message with printf formatting.
545  * @...:   extra parameters for the message.
546  *
547  * Formats @msg and places result into @buf.
548  *
549  * Returns the number of characters written to @buf or -1 if an error occurs.
550  */
551 int XMLCDECL
xmlStrPrintf(xmlChar * buf,int len,const char * msg,...)552 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
553     va_list args;
554     int ret;
555 
556     if((buf == NULL) || (msg == NULL)) {
557         return(-1);
558     }
559 
560     va_start(args, msg);
561     ret = vsnprintf((char *) buf, len, (const char *) msg, args);
562     va_end(args);
563     buf[len - 1] = 0; /* be safe ! */
564 
565     return(ret);
566 }
567 
568 /**
569  * xmlStrVPrintf:
570  * @buf:   the result buffer.
571  * @len:   the result buffer length.
572  * @msg:   the message with printf formatting.
573  * @ap:    extra parameters for the message.
574  *
575  * Formats @msg and places result into @buf.
576  *
577  * Returns the number of characters written to @buf or -1 if an error occurs.
578  */
579 int
xmlStrVPrintf(xmlChar * buf,int len,const char * msg,va_list ap)580 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
581     int ret;
582 
583     if((buf == NULL) || (msg == NULL)) {
584         return(-1);
585     }
586 
587     ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
588     buf[len - 1] = 0; /* be safe ! */
589 
590     return(ret);
591 }
592 
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 * (The routines below do handle 4-byte sequences nonetheless.)         *
 *                                                                      *
 ************************************************************************/
607 
608 
609 /**
610  * xmlUTF8Size:
611  * @utf: pointer to the UTF8 character
612  *
613  * calculates the internal size of a UTF8 character
614  *
615  * returns the numbers of bytes in the character, -1 on format error
616  */
617 int
xmlUTF8Size(const xmlChar * utf)618 xmlUTF8Size(const xmlChar *utf) {
619     xmlChar mask;
620     int len;
621 
622     if (utf == NULL)
623         return -1;
624     if (*utf < 0x80)
625         return 1;
626     /* check valid UTF8 character */
627     if (!(*utf & 0x40))
628         return -1;
629     /* determine number of bytes in char */
630     len = 2;
631     for (mask=0x20; mask != 0; mask>>=1) {
632         if (!(*utf & mask))
633             return len;
634         len++;
635     }
636     return -1;
637 }
638 
639 /**
640  * xmlUTF8Charcmp:
641  * @utf1: pointer to first UTF8 char
642  * @utf2: pointer to second UTF8 char
643  *
644  * compares the two UCS4 values
645  *
646  * returns result of the compare as with xmlStrncmp
647  */
648 int
xmlUTF8Charcmp(const xmlChar * utf1,const xmlChar * utf2)649 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
650 
651     if (utf1 == NULL ) {
652         if (utf2 == NULL)
653             return 0;
654         return -1;
655     }
656     return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
657 }
658 
659 /**
660  * xmlUTF8Strlen:
661  * @utf:  a sequence of UTF-8 encoded bytes
662  *
663  * compute the length of an UTF8 string, it doesn't do a full UTF8
664  * checking of the content of the string.
665  *
666  * Returns the number of characters in the string or -1 in case of error
667  */
668 int
xmlUTF8Strlen(const xmlChar * utf)669 xmlUTF8Strlen(const xmlChar *utf) {
670     int ret = 0;
671 
672     if (utf == NULL)
673         return(-1);
674 
675     while (*utf != 0) {
676         if (utf[0] & 0x80) {
677             if ((utf[1] & 0xc0) != 0x80)
678                 return(-1);
679             if ((utf[0] & 0xe0) == 0xe0) {
680                 if ((utf[2] & 0xc0) != 0x80)
681                     return(-1);
682                 if ((utf[0] & 0xf0) == 0xf0) {
683                     if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
684                         return(-1);
685                     utf += 4;
686                 } else {
687                     utf += 3;
688                 }
689             } else {
690                 utf += 2;
691             }
692         } else {
693             utf++;
694         }
695         ret++;
696     }
697     return(ret);
698 }
699 
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. The lead byte must be either
 * a single byte value (0xxxxxxx) or a proper multi-byte lead (11xxxxxx,
 * at most 4 bytes long); each continuation byte must be of the form
 * 10xxxxxx. Overlong encodings are not rejected.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /*
         * 10xxxxxx is a continuation byte and can never start a
         * character: do not decode it as a 2-byte lead.
         */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* leads of 5 bytes and more (11111xxx) are refused */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
769 
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;
    int trailing;

    if (utf == NULL)
        return(0);

    /*
     * Each character is 1, 2, 3 or 4 bytes long.  The valid forms
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    cur = utf;
    while (*cur != 0) {                 /* string is 0-terminated */
        unsigned char lead = *cur++;

        if ((lead & 0x80) == 0x00)      /* 1-byte code, starts with 0 */
            trailing = 0;
        else if ((lead & 0xe0) == 0xc0) /* 2-byte code, starts with 110 */
            trailing = 1;
        else if ((lead & 0xf0) == 0xe0) /* 3-byte code, starts with 1110 */
            trailing = 2;
        else if ((lead & 0xf8) == 0xf0) /* 4-byte code, starts with 11110 */
            trailing = 3;
        else                            /* unknown encoding */
            return(0);

        /* every announced continuation byte must look like 10xxxxxx */
        for (; trailing > 0; trailing--) {
            if ((*cur++ & 0xc0) != 0x80)
                return(0);
        }
    }
    return(1);
}
822 
823 /**
824  * xmlUTF8Strsize:
825  * @utf:  a sequence of UTF-8 encoded bytes
826  * @len:  the number of characters in the array
827  *
828  * storage size of an UTF8 string
829  * the behaviour is not guaranteed if the input string is not UTF-8
830  *
831  * Returns the storage size of
832  * the first 'len' characters of ARRAY
833  */
834 
835 int
xmlUTF8Strsize(const xmlChar * utf,int len)836 xmlUTF8Strsize(const xmlChar *utf, int len) {
837     const xmlChar   *ptr=utf;
838     xmlChar         ch;
839 
840     if (utf == NULL)
841         return(0);
842 
843     if (len <= 0)
844         return(0);
845 
846     while ( len-- > 0) {
847         if ( !*ptr )
848             break;
849         if ( (ch = *ptr++) & 0x80)
850             while ((ch<<=1) & 0x80 ) {
851 		if (*ptr == 0) break;
852                 ptr++;
853 	    }
854     }
855     return (ptr - utf);
856 }
857 
858 
859 /**
860  * xmlUTF8Strndup:
861  * @utf:  the input UTF8 *
862  * @len:  the len of @utf (in chars)
863  *
864  * a strndup for array of UTF8's
865  *
866  * Returns a new UTF8 * or NULL
867  */
868 xmlChar *
xmlUTF8Strndup(const xmlChar * utf,int len)869 xmlUTF8Strndup(const xmlChar *utf, int len) {
870     xmlChar *ret;
871     int i;
872 
873     if ((utf == NULL) || (len < 0)) return(NULL);
874     i = xmlUTF8Strsize(utf, len);
875     ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
876     if (ret == NULL) {
877         xmlGenericError(xmlGenericErrorContext,
878                 "malloc of %ld byte failed\n",
879                 (len + 1) * (long)sizeof(xmlChar));
880         return(NULL);
881     }
882     memcpy(ret, utf, i * sizeof(xmlChar));
883     ret[i] = 0;
884     return(ret);
885 }
886 
887 /**
888  * xmlUTF8Strpos:
889  * @utf:  the input UTF8 *
890  * @pos:  the position of the desired UTF8 char (in chars)
891  *
892  * a function to provide the equivalent of fetching a
893  * character from a string array
894  *
895  * Returns a pointer to the UTF8 character or NULL
896  */
897 const xmlChar *
xmlUTF8Strpos(const xmlChar * utf,int pos)898 xmlUTF8Strpos(const xmlChar *utf, int pos) {
899     xmlChar ch;
900 
901     if (utf == NULL) return(NULL);
902     if (pos < 0)
903         return(NULL);
904     while (pos--) {
905         if ((ch=*utf++) == 0) return(NULL);
906         if ( ch & 0x80 ) {
907             /* if not simple ascii, verify proper format */
908             if ( (ch & 0xc0) != 0xc0 )
909                 return(NULL);
910             /* then skip over remaining bytes for this char */
911             while ( (ch <<= 1) & 0x80 )
912                 if ( (*utf++ & 0xc0) != 0x80 )
913                     return(NULL);
914         }
915     }
916     return((xmlChar *)utf);
917 }
918 
919 /**
920  * xmlUTF8Strloc:
921  * @utf:  the input UTF8 *
922  * @utfchar:  the UTF8 character to be found
923  *
924  * a function to provide the relative location of a UTF8 char
925  *
926  * Returns the relative character position of the desired char
927  * or -1 if not found
928  */
929 int
xmlUTF8Strloc(const xmlChar * utf,const xmlChar * utfchar)930 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
931     int i, size;
932     xmlChar ch;
933 
934     if (utf==NULL || utfchar==NULL) return -1;
935     size = xmlUTF8Strsize(utfchar, 1);
936         for(i=0; (ch=*utf) != 0; i++) {
937             if (xmlStrncmp(utf, utfchar, size)==0)
938                 return(i);
939             utf++;
940             if ( ch & 0x80 ) {
941                 /* if not simple ascii, verify proper format */
942                 if ( (ch & 0xc0) != 0xc0 )
943                     return(-1);
944                 /* then skip over remaining bytes for this char */
945                 while ( (ch <<= 1) & 0x80 )
946                     if ( (*utf++ & 0xc0) != 0x80 )
947                         return(-1);
948             }
949         }
950 
951     return(-1);
952 }
953 /**
954  * xmlUTF8Strsub:
955  * @utf:  a sequence of UTF-8 encoded bytes
956  * @start: relative pos of first char
957  * @len:   total number to copy
958  *
959  * Create a substring from a given UTF-8 string
960  * Note:  positions are given in units of UTF-8 chars
961  *
962  * Returns a pointer to a newly created string
963  * or NULL if any problem
964  */
965 
966 xmlChar *
xmlUTF8Strsub(const xmlChar * utf,int start,int len)967 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
968     int            i;
969     xmlChar ch;
970 
971     if (utf == NULL) return(NULL);
972     if (start < 0) return(NULL);
973     if (len < 0) return(NULL);
974 
975     /*
976      * Skip over any leading chars
977      */
978     for (i = 0;i < start;i++) {
979         if ((ch=*utf++) == 0) return(NULL);
980         if ( ch & 0x80 ) {
981             /* if not simple ascii, verify proper format */
982             if ( (ch & 0xc0) != 0xc0 )
983                 return(NULL);
984             /* then skip over remaining bytes for this char */
985             while ( (ch <<= 1) & 0x80 )
986                 if ( (*utf++ & 0xc0) != 0x80 )
987                     return(NULL);
988         }
989     }
990 
991     return(xmlUTF8Strndup(utf, len));
992 }
993 
994 /**
995  * xmlEscapeFormatString:
996  * @msg:  a pointer to the string in which to escape '%' characters.
997  * Must be a heap-allocated buffer created by libxml2 that may be
998  * returned, or that may be freed and replaced.
999  *
1000  * Replaces the string pointed to by 'msg' with an escaped string.
1001  * Returns the same string with all '%' characters escaped.
1002  */
1003 xmlChar *
xmlEscapeFormatString(xmlChar ** msg)1004 xmlEscapeFormatString(xmlChar **msg)
1005 {
1006     xmlChar *msgPtr = NULL;
1007     xmlChar *result = NULL;
1008     xmlChar *resultPtr = NULL;
1009     size_t count = 0;
1010     size_t msgLen = 0;
1011     size_t resultLen = 0;
1012 
1013     if (!msg || !*msg)
1014         return(NULL);
1015 
1016     for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1017         ++msgLen;
1018         if (*msgPtr == '%')
1019             ++count;
1020     }
1021 
1022     if (count == 0)
1023         return(*msg);
1024 
1025     resultLen = msgLen + count + 1;
1026     result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1027     if (result == NULL) {
1028         /* Clear *msg to prevent format string vulnerabilities in
1029            out-of-memory situations. */
1030         xmlFree(*msg);
1031         *msg = NULL;
1032         xmlErrMemory(NULL, NULL);
1033         return(NULL);
1034     }
1035 
1036     for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1037         *resultPtr = *msgPtr;
1038         if (*msgPtr == '%')
1039             *(++resultPtr) = '%';
1040     }
1041     result[resultLen - 1] = '\0';
1042 
1043     xmlFree(*msg);
1044     *msg = result;
1045 
1046     return *msg;
1047 }
1048 
1049 #define bottom_xmlstring
1050 #include "elfgcchack.h"
1051