1 /*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16 #define IN_LIBXML
17 #include "libxml.h"
18
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/xmlmemory.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlstring.h>
24
25 /************************************************************************
26 * *
27 * Commodity functions to handle xmlChars *
28 * *
29 ************************************************************************/
30
31 /**
32 * xmlStrndup:
33 * @cur: the input xmlChar *
34 * @len: the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40 xmlChar *
xmlStrndup(const xmlChar * cur,int len)41 xmlStrndup(const xmlChar *cur, int len) {
42 xmlChar *ret;
43
44 if ((cur == NULL) || (len < 0)) return(NULL);
45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46 if (ret == NULL) {
47 xmlErrMemory(NULL, NULL);
48 return(NULL);
49 }
50 memcpy(ret, cur, len * sizeof(xmlChar));
51 ret[len] = 0;
52 return(ret);
53 }
54
55 /**
56 * xmlStrdup:
57 * @cur: the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65 xmlChar *
xmlStrdup(const xmlChar * cur)66 xmlStrdup(const xmlChar *cur) {
67 const xmlChar *p = cur;
68
69 if (cur == NULL) return(NULL);
70 while (*p != 0) p++; /* non input consuming */
71 return(xmlStrndup(cur, p - cur));
72 }
73
74 /**
75 * xmlCharStrndup:
76 * @cur: the input char *
77 * @len: the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84 xmlChar *
xmlCharStrndup(const char * cur,int len)85 xmlCharStrndup(const char *cur, int len) {
86 int i;
87 xmlChar *ret;
88
89 if ((cur == NULL) || (len < 0)) return(NULL);
90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91 if (ret == NULL) {
92 xmlErrMemory(NULL, NULL);
93 return(NULL);
94 }
95 for (i = 0;i < len;i++) {
96 ret[i] = (xmlChar) cur[i];
97 if (ret[i] == 0) return(ret);
98 }
99 ret[len] = 0;
100 return(ret);
101 }
102
103 /**
104 * xmlCharStrdup:
105 * @cur: the input char *
106 *
107 * a strdup for char's to xmlChar's
108 *
109 * Returns a new xmlChar * or NULL
110 */
111
112 xmlChar *
xmlCharStrdup(const char * cur)113 xmlCharStrdup(const char *cur) {
114 const char *p = cur;
115
116 if (cur == NULL) return(NULL);
117 while (*p != '\0') p++; /* non input consuming */
118 return(xmlCharStrndup(cur, p - cur));
119 }
120
121 /**
122 * xmlStrcmp:
123 * @str1: the first xmlChar *
124 * @str2: the second xmlChar *
125 *
126 * a strcmp for xmlChar's
127 *
128 * Returns the integer result of the comparison
129 */
130
131 int
xmlStrcmp(const xmlChar * str1,const xmlChar * str2)132 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133 if (str1 == str2) return(0);
134 if (str1 == NULL) return(-1);
135 if (str2 == NULL) return(1);
136 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
137 return(strcmp((const char *)str1, (const char *)str2));
138 #else
139 do {
140 int tmp = *str1++ - *str2;
141 if (tmp != 0) return(tmp);
142 } while (*str2++ != 0);
143 return 0;
144 #endif
145 }
146
147 /**
148 * xmlStrEqual:
149 * @str1: the first xmlChar *
150 * @str2: the second xmlChar *
151 *
152 * Check if both strings are equal of have same content.
153 * Should be a bit more readable and faster than xmlStrcmp()
154 *
155 * Returns 1 if they are equal, 0 if they are different
156 */
157
158 int
xmlStrEqual(const xmlChar * str1,const xmlChar * str2)159 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
160 if (str1 == str2) return(1);
161 if (str1 == NULL) return(0);
162 if (str2 == NULL) return(0);
163 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
164 return(strcmp((const char *)str1, (const char *)str2) == 0);
165 #else
166 do {
167 if (*str1++ != *str2) return(0);
168 } while (*str2++);
169 return(1);
170 #endif
171 }
172
173 /**
174 * xmlStrQEqual:
175 * @pref: the prefix of the QName
176 * @name: the localname of the QName
177 * @str: the second xmlChar *
178 *
179 * Check if a QName is Equal to a given string
180 *
181 * Returns 1 if they are equal, 0 if they are different
182 */
183
184 int
xmlStrQEqual(const xmlChar * pref,const xmlChar * name,const xmlChar * str)185 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
186 if (pref == NULL) return(xmlStrEqual(name, str));
187 if (name == NULL) return(0);
188 if (str == NULL) return(0);
189
190 do {
191 if (*pref++ != *str) return(0);
192 } while ((*str++) && (*pref));
193 if (*str++ != ':') return(0);
194 do {
195 if (*name++ != *str) return(0);
196 } while (*str++);
197 return(1);
198 }
199
200 /**
201 * xmlStrncmp:
202 * @str1: the first xmlChar *
203 * @str2: the second xmlChar *
204 * @len: the max comparison length
205 *
206 * a strncmp for xmlChar's
207 *
208 * Returns the integer result of the comparison
209 */
210
211 int
xmlStrncmp(const xmlChar * str1,const xmlChar * str2,int len)212 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
213 if (len <= 0) return(0);
214 if (str1 == str2) return(0);
215 if (str1 == NULL) return(-1);
216 if (str2 == NULL) return(1);
217 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
218 return(strncmp((const char *)str1, (const char *)str2, len));
219 #else
220 do {
221 int tmp = *str1++ - *str2;
222 if (tmp != 0 || --len == 0) return(tmp);
223 } while (*str2++ != 0);
224 return 0;
225 #endif
226 }
227
228 static const xmlChar casemap[256] = {
229 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
230 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
231 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
232 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
233 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
234 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
235 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
236 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
237 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
238 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
239 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
240 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
241 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
242 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
243 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
244 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
245 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
246 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
247 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
248 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
249 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
250 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
251 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
252 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
253 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
254 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
255 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
256 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
257 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
258 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
259 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
260 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
261 };
262
263 /**
264 * xmlStrcasecmp:
265 * @str1: the first xmlChar *
266 * @str2: the second xmlChar *
267 *
268 * a strcasecmp for xmlChar's
269 *
270 * Returns the integer result of the comparison
271 */
272
273 int
xmlStrcasecmp(const xmlChar * str1,const xmlChar * str2)274 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
275 register int tmp;
276
277 if (str1 == str2) return(0);
278 if (str1 == NULL) return(-1);
279 if (str2 == NULL) return(1);
280 do {
281 tmp = casemap[*str1++] - casemap[*str2];
282 if (tmp != 0) return(tmp);
283 } while (*str2++ != 0);
284 return 0;
285 }
286
287 /**
288 * xmlStrncasecmp:
289 * @str1: the first xmlChar *
290 * @str2: the second xmlChar *
291 * @len: the max comparison length
292 *
293 * a strncasecmp for xmlChar's
294 *
295 * Returns the integer result of the comparison
296 */
297
298 int
xmlStrncasecmp(const xmlChar * str1,const xmlChar * str2,int len)299 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
300 register int tmp;
301
302 if (len <= 0) return(0);
303 if (str1 == str2) return(0);
304 if (str1 == NULL) return(-1);
305 if (str2 == NULL) return(1);
306 do {
307 tmp = casemap[*str1++] - casemap[*str2];
308 if (tmp != 0 || --len == 0) return(tmp);
309 } while (*str2++ != 0);
310 return 0;
311 }
312
313 /**
314 * xmlStrchr:
315 * @str: the xmlChar * array
316 * @val: the xmlChar to search
317 *
318 * a strchr for xmlChar's
319 *
320 * Returns the xmlChar * for the first occurrence or NULL.
321 */
322
323 const xmlChar *
xmlStrchr(const xmlChar * str,xmlChar val)324 xmlStrchr(const xmlChar *str, xmlChar val) {
325 if (str == NULL) return(NULL);
326 while (*str != 0) { /* non input consuming */
327 if (*str == val) return((xmlChar *) str);
328 str++;
329 }
330 return(NULL);
331 }
332
333 /**
334 * xmlStrstr:
335 * @str: the xmlChar * array (haystack)
336 * @val: the xmlChar to search (needle)
337 *
338 * a strstr for xmlChar's
339 *
340 * Returns the xmlChar * for the first occurrence or NULL.
341 */
342
343 const xmlChar *
xmlStrstr(const xmlChar * str,const xmlChar * val)344 xmlStrstr(const xmlChar *str, const xmlChar *val) {
345 int n;
346
347 if (str == NULL) return(NULL);
348 if (val == NULL) return(NULL);
349 n = xmlStrlen(val);
350
351 if (n == 0) return(str);
352 while (*str != 0) { /* non input consuming */
353 if (*str == *val) {
354 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
355 }
356 str++;
357 }
358 return(NULL);
359 }
360
361 /**
362 * xmlStrcasestr:
363 * @str: the xmlChar * array (haystack)
364 * @val: the xmlChar to search (needle)
365 *
366 * a case-ignoring strstr for xmlChar's
367 *
368 * Returns the xmlChar * for the first occurrence or NULL.
369 */
370
371 const xmlChar *
xmlStrcasestr(const xmlChar * str,const xmlChar * val)372 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
373 int n;
374
375 if (str == NULL) return(NULL);
376 if (val == NULL) return(NULL);
377 n = xmlStrlen(val);
378
379 if (n == 0) return(str);
380 while (*str != 0) { /* non input consuming */
381 if (casemap[*str] == casemap[*val])
382 if (!xmlStrncasecmp(str, val, n)) return(str);
383 str++;
384 }
385 return(NULL);
386 }
387
388 /**
389 * xmlStrsub:
390 * @str: the xmlChar * array (haystack)
391 * @start: the index of the first char (zero based)
392 * @len: the length of the substring
393 *
394 * Extract a substring of a given string
395 *
396 * Returns the xmlChar * for the first occurrence or NULL.
397 */
398
399 xmlChar *
xmlStrsub(const xmlChar * str,int start,int len)400 xmlStrsub(const xmlChar *str, int start, int len) {
401 int i;
402
403 if (str == NULL) return(NULL);
404 if (start < 0) return(NULL);
405 if (len < 0) return(NULL);
406
407 for (i = 0;i < start;i++) {
408 if (*str == 0) return(NULL);
409 str++;
410 }
411 if (*str == 0) return(NULL);
412 return(xmlStrndup(str, len));
413 }
414
415 /**
416 * xmlStrlen:
417 * @str: the xmlChar * array
418 *
419 * length of a xmlChar's string
420 *
421 * Returns the number of xmlChar contained in the ARRAY.
422 */
423
424 int
xmlStrlen(const xmlChar * str)425 xmlStrlen(const xmlChar *str) {
426 int len = 0;
427
428 if (str == NULL) return(0);
429 while (*str != 0) { /* non input consuming */
430 str++;
431 len++;
432 }
433 return(len);
434 }
435
436 /**
437 * xmlStrncat:
438 * @cur: the original xmlChar * array
439 * @add: the xmlChar * array added
440 * @len: the length of @add
441 *
442 * a strncat for array of xmlChar's, it will extend @cur with the len
443 * first bytes of @add. Note that if @len < 0 then this is an API error
444 * and NULL will be returned.
445 *
446 * Returns a new xmlChar *, the original @cur is reallocated and should
447 * not be freed.
448 */
449
450 xmlChar *
xmlStrncat(xmlChar * cur,const xmlChar * add,int len)451 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
452 int size;
453 xmlChar *ret;
454
455 if ((add == NULL) || (len == 0))
456 return(cur);
457 if (len < 0)
458 return(NULL);
459 if (cur == NULL)
460 return(xmlStrndup(add, len));
461
462 size = xmlStrlen(cur);
463 if (size < 0)
464 return(NULL);
465 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
466 if (ret == NULL) {
467 xmlErrMemory(NULL, NULL);
468 return(cur);
469 }
470 memcpy(&ret[size], add, len * sizeof(xmlChar));
471 ret[size + len] = 0;
472 return(ret);
473 }
474
475 /**
476 * xmlStrncatNew:
477 * @str1: first xmlChar string
478 * @str2: second xmlChar string
479 * @len: the len of @str2 or < 0
480 *
481 * same as xmlStrncat, but creates a new string. The original
482 * two strings are not freed. If @len is < 0 then the length
483 * will be calculated automatically.
484 *
485 * Returns a new xmlChar * or NULL
486 */
487 xmlChar *
xmlStrncatNew(const xmlChar * str1,const xmlChar * str2,int len)488 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
489 int size;
490 xmlChar *ret;
491
492 if (len < 0) {
493 len = xmlStrlen(str2);
494 if (len < 0)
495 return(NULL);
496 }
497 if ((str2 == NULL) || (len == 0))
498 return(xmlStrdup(str1));
499 if (str1 == NULL)
500 return(xmlStrndup(str2, len));
501
502 size = xmlStrlen(str1);
503 if (size < 0)
504 return(NULL);
505 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
506 if (ret == NULL) {
507 xmlErrMemory(NULL, NULL);
508 return(xmlStrndup(str1, size));
509 }
510 memcpy(ret, str1, size * sizeof(xmlChar));
511 memcpy(&ret[size], str2, len * sizeof(xmlChar));
512 ret[size + len] = 0;
513 return(ret);
514 }
515
516 /**
517 * xmlStrcat:
518 * @cur: the original xmlChar * array
519 * @add: the xmlChar * array added
520 *
521 * a strcat for array of xmlChar's. Since they are supposed to be
522 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
523 * a termination mark of '0'.
524 *
525 * Returns a new xmlChar * containing the concatenated string. The original
526 * @cur is reallocated and should not be freed.
527 */
528 xmlChar *
xmlStrcat(xmlChar * cur,const xmlChar * add)529 xmlStrcat(xmlChar *cur, const xmlChar *add) {
530 const xmlChar *p = add;
531
532 if (add == NULL) return(cur);
533 if (cur == NULL)
534 return(xmlStrdup(add));
535
536 while (*p != 0) p++; /* non input consuming */
537 return(xmlStrncat(cur, add, p - add));
538 }
539
540 /**
541 * xmlStrPrintf:
542 * @buf: the result buffer.
543 * @len: the result buffer length.
544 * @msg: the message with printf formatting.
545 * @...: extra parameters for the message.
546 *
547 * Formats @msg and places result into @buf.
548 *
549 * Returns the number of characters written to @buf or -1 if an error occurs.
550 */
551 int XMLCDECL
xmlStrPrintf(xmlChar * buf,int len,const char * msg,...)552 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
553 va_list args;
554 int ret;
555
556 if((buf == NULL) || (msg == NULL)) {
557 return(-1);
558 }
559
560 va_start(args, msg);
561 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
562 va_end(args);
563 buf[len - 1] = 0; /* be safe ! */
564
565 return(ret);
566 }
567
568 /**
569 * xmlStrVPrintf:
570 * @buf: the result buffer.
571 * @len: the result buffer length.
572 * @msg: the message with printf formatting.
573 * @ap: extra parameters for the message.
574 *
575 * Formats @msg and places result into @buf.
576 *
577 * Returns the number of characters written to @buf or -1 if an error occurs.
578 */
579 int
xmlStrVPrintf(xmlChar * buf,int len,const char * msg,va_list ap)580 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
581 int ret;
582
583 if((buf == NULL) || (msg == NULL)) {
584 return(-1);
585 }
586
587 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
588 buf[len - 1] = 0; /* be safe ! */
589
590 return(ret);
591 }
592
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon ! Note however      *
 * that several routines below do handle 4-byte sequences.              *
 *                                                                      *
 ************************************************************************/
607
608
609 /**
610 * xmlUTF8Size:
611 * @utf: pointer to the UTF8 character
612 *
613 * calculates the internal size of a UTF8 character
614 *
615 * returns the numbers of bytes in the character, -1 on format error
616 */
617 int
xmlUTF8Size(const xmlChar * utf)618 xmlUTF8Size(const xmlChar *utf) {
619 xmlChar mask;
620 int len;
621
622 if (utf == NULL)
623 return -1;
624 if (*utf < 0x80)
625 return 1;
626 /* check valid UTF8 character */
627 if (!(*utf & 0x40))
628 return -1;
629 /* determine number of bytes in char */
630 len = 2;
631 for (mask=0x20; mask != 0; mask>>=1) {
632 if (!(*utf & mask))
633 return len;
634 len++;
635 }
636 return -1;
637 }
638
639 /**
640 * xmlUTF8Charcmp:
641 * @utf1: pointer to first UTF8 char
642 * @utf2: pointer to second UTF8 char
643 *
644 * compares the two UCS4 values
645 *
646 * returns result of the compare as with xmlStrncmp
647 */
648 int
xmlUTF8Charcmp(const xmlChar * utf1,const xmlChar * utf2)649 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
650
651 if (utf1 == NULL ) {
652 if (utf2 == NULL)
653 return 0;
654 return -1;
655 }
656 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
657 }
658
659 /**
660 * xmlUTF8Strlen:
661 * @utf: a sequence of UTF-8 encoded bytes
662 *
663 * compute the length of an UTF8 string, it doesn't do a full UTF8
664 * checking of the content of the string.
665 *
666 * Returns the number of characters in the string or -1 in case of error
667 */
668 int
xmlUTF8Strlen(const xmlChar * utf)669 xmlUTF8Strlen(const xmlChar *utf) {
670 int ret = 0;
671
672 if (utf == NULL)
673 return(-1);
674
675 while (*utf != 0) {
676 if (utf[0] & 0x80) {
677 if ((utf[1] & 0xc0) != 0x80)
678 return(-1);
679 if ((utf[0] & 0xe0) == 0xe0) {
680 if ((utf[2] & 0xc0) != 0x80)
681 return(-1);
682 if ((utf[0] & 0xf0) == 0xf0) {
683 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
684 return(-1);
685 utf += 4;
686 } else {
687 utf += 3;
688 }
689 } else {
690 utf += 2;
691 }
692 } else {
693 utf++;
694 }
695 ret++;
696 }
697 return(ret);
698 }
699
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. A first byte of the form
 * 10xxxxxx (a trailing byte) or 11111xxx is rejected, as is any
 * character that needs more than *@len bytes or whose trailing bytes
 * are not of the form 10xxxxxx. The decoded value is not range-checked.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if ((c & 0x80) == 0) {
        /* 1-byte code */
        *len = 1;
        return((int) c);
    }

    /* 10xxxxxx is a trailing byte and can never start a character */
    if ((c & 0xc0) != 0xc0)
        goto error;
    if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
        goto error;
    if ((c & 0xe0) != 0xe0) {
        /* 2-byte code */
        *len = 2;
        c = (utf[0] & 0x1f) << 6;
        c |= utf[1] & 0x3f;
        return((int) c);
    }

    if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
        goto error;
    if ((c & 0xf0) != 0xf0) {
        /* 3-byte code */
        *len = 3;
        c = (utf[0] & 0xf) << 12;
        c |= (utf[1] & 0x3f) << 6;
        c |= utf[2] & 0x3f;
        return((int) c);
    }

    if (*len < 4)
        goto error;
    /* the lead byte must be exactly 11110xxx */
    if (((c & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto error;
    /* 4-byte code */
    *len = 4;
    c = (utf[0] & 0x7) << 18;
    c |= (utf[1] & 0x3f) << 12;
    c |= (utf[2] & 0x3f) << 6;
    c |= utf[3] & 0x3f;
    return((int) c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
769
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;

    if (utf == NULL)
        return(0);

    /*
     * Each character is 1, 2, 3 or 4 bytes long. The accepted forms
     * are (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    cur = utf;
    while (*cur != 0) {            /* string is 0-terminated */
        const unsigned char lead = *cur;
        int trailing;
        int k;

        if ((lead & 0x80) == 0x00)         /* 1-byte code, starts with 0 */
            trailing = 0;
        else if ((lead & 0xe0) == 0xc0)    /* 2-byte code, starts with 110 */
            trailing = 1;
        else if ((lead & 0xf0) == 0xe0)    /* 3-byte code, starts with 1110 */
            trailing = 2;
        else if ((lead & 0xf8) == 0xf0)    /* 4-byte code, starts with 11110 */
            trailing = 3;
        else                               /* unknown encoding */
            return(0);

        /* checked in order, so nothing is read after a bad (or 0) byte */
        for (k = 1; k <= trailing; k++) {
            if ((cur[k] & 0xc0) != 0x80)
                return(0);
        }
        cur += trailing + 1;
    }
    return(1);
}
822
823 /**
824 * xmlUTF8Strsize:
825 * @utf: a sequence of UTF-8 encoded bytes
826 * @len: the number of characters in the array
827 *
828 * storage size of an UTF8 string
829 * the behaviour is not guaranteed if the input string is not UTF-8
830 *
831 * Returns the storage size of
832 * the first 'len' characters of ARRAY
833 */
834
835 int
xmlUTF8Strsize(const xmlChar * utf,int len)836 xmlUTF8Strsize(const xmlChar *utf, int len) {
837 const xmlChar *ptr=utf;
838 xmlChar ch;
839
840 if (utf == NULL)
841 return(0);
842
843 if (len <= 0)
844 return(0);
845
846 while ( len-- > 0) {
847 if ( !*ptr )
848 break;
849 if ( (ch = *ptr++) & 0x80)
850 while ((ch<<=1) & 0x80 ) {
851 if (*ptr == 0) break;
852 ptr++;
853 }
854 }
855 return (ptr - utf);
856 }
857
858
859 /**
860 * xmlUTF8Strndup:
861 * @utf: the input UTF8 *
862 * @len: the len of @utf (in chars)
863 *
864 * a strndup for array of UTF8's
865 *
866 * Returns a new UTF8 * or NULL
867 */
868 xmlChar *
xmlUTF8Strndup(const xmlChar * utf,int len)869 xmlUTF8Strndup(const xmlChar *utf, int len) {
870 xmlChar *ret;
871 int i;
872
873 if ((utf == NULL) || (len < 0)) return(NULL);
874 i = xmlUTF8Strsize(utf, len);
875 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
876 if (ret == NULL) {
877 xmlGenericError(xmlGenericErrorContext,
878 "malloc of %ld byte failed\n",
879 (len + 1) * (long)sizeof(xmlChar));
880 return(NULL);
881 }
882 memcpy(ret, utf, i * sizeof(xmlChar));
883 ret[i] = 0;
884 return(ret);
885 }
886
887 /**
888 * xmlUTF8Strpos:
889 * @utf: the input UTF8 *
890 * @pos: the position of the desired UTF8 char (in chars)
891 *
892 * a function to provide the equivalent of fetching a
893 * character from a string array
894 *
895 * Returns a pointer to the UTF8 character or NULL
896 */
897 const xmlChar *
xmlUTF8Strpos(const xmlChar * utf,int pos)898 xmlUTF8Strpos(const xmlChar *utf, int pos) {
899 xmlChar ch;
900
901 if (utf == NULL) return(NULL);
902 if (pos < 0)
903 return(NULL);
904 while (pos--) {
905 if ((ch=*utf++) == 0) return(NULL);
906 if ( ch & 0x80 ) {
907 /* if not simple ascii, verify proper format */
908 if ( (ch & 0xc0) != 0xc0 )
909 return(NULL);
910 /* then skip over remaining bytes for this char */
911 while ( (ch <<= 1) & 0x80 )
912 if ( (*utf++ & 0xc0) != 0x80 )
913 return(NULL);
914 }
915 }
916 return((xmlChar *)utf);
917 }
918
919 /**
920 * xmlUTF8Strloc:
921 * @utf: the input UTF8 *
922 * @utfchar: the UTF8 character to be found
923 *
924 * a function to provide the relative location of a UTF8 char
925 *
926 * Returns the relative character position of the desired char
927 * or -1 if not found
928 */
929 int
xmlUTF8Strloc(const xmlChar * utf,const xmlChar * utfchar)930 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
931 int i, size;
932 xmlChar ch;
933
934 if (utf==NULL || utfchar==NULL) return -1;
935 size = xmlUTF8Strsize(utfchar, 1);
936 for(i=0; (ch=*utf) != 0; i++) {
937 if (xmlStrncmp(utf, utfchar, size)==0)
938 return(i);
939 utf++;
940 if ( ch & 0x80 ) {
941 /* if not simple ascii, verify proper format */
942 if ( (ch & 0xc0) != 0xc0 )
943 return(-1);
944 /* then skip over remaining bytes for this char */
945 while ( (ch <<= 1) & 0x80 )
946 if ( (*utf++ & 0xc0) != 0x80 )
947 return(-1);
948 }
949 }
950
951 return(-1);
952 }
953 /**
954 * xmlUTF8Strsub:
955 * @utf: a sequence of UTF-8 encoded bytes
956 * @start: relative pos of first char
957 * @len: total number to copy
958 *
959 * Create a substring from a given UTF-8 string
960 * Note: positions are given in units of UTF-8 chars
961 *
962 * Returns a pointer to a newly created string
963 * or NULL if any problem
964 */
965
966 xmlChar *
xmlUTF8Strsub(const xmlChar * utf,int start,int len)967 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
968 int i;
969 xmlChar ch;
970
971 if (utf == NULL) return(NULL);
972 if (start < 0) return(NULL);
973 if (len < 0) return(NULL);
974
975 /*
976 * Skip over any leading chars
977 */
978 for (i = 0;i < start;i++) {
979 if ((ch=*utf++) == 0) return(NULL);
980 if ( ch & 0x80 ) {
981 /* if not simple ascii, verify proper format */
982 if ( (ch & 0xc0) != 0xc0 )
983 return(NULL);
984 /* then skip over remaining bytes for this char */
985 while ( (ch <<= 1) & 0x80 )
986 if ( (*utf++ & 0xc0) != 0x80 )
987 return(NULL);
988 }
989 }
990
991 return(xmlUTF8Strndup(utf, len));
992 }
993
994 /**
995 * xmlEscapeFormatString:
996 * @msg: a pointer to the string in which to escape '%' characters.
997 * Must be a heap-allocated buffer created by libxml2 that may be
998 * returned, or that may be freed and replaced.
999 *
1000 * Replaces the string pointed to by 'msg' with an escaped string.
1001 * Returns the same string with all '%' characters escaped.
1002 */
1003 xmlChar *
xmlEscapeFormatString(xmlChar ** msg)1004 xmlEscapeFormatString(xmlChar **msg)
1005 {
1006 xmlChar *msgPtr = NULL;
1007 xmlChar *result = NULL;
1008 xmlChar *resultPtr = NULL;
1009 size_t count = 0;
1010 size_t msgLen = 0;
1011 size_t resultLen = 0;
1012
1013 if (!msg || !*msg)
1014 return(NULL);
1015
1016 for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1017 ++msgLen;
1018 if (*msgPtr == '%')
1019 ++count;
1020 }
1021
1022 if (count == 0)
1023 return(*msg);
1024
1025 resultLen = msgLen + count + 1;
1026 result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1027 if (result == NULL) {
1028 /* Clear *msg to prevent format string vulnerabilities in
1029 out-of-memory situations. */
1030 xmlFree(*msg);
1031 *msg = NULL;
1032 xmlErrMemory(NULL, NULL);
1033 return(NULL);
1034 }
1035
1036 for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1037 *resultPtr = *msgPtr;
1038 if (*msgPtr == '%')
1039 *(++resultPtr) = '%';
1040 }
1041 result[resultLen - 1] = '\0';
1042
1043 xmlFree(*msg);
1044 *msg = result;
1045
1046 return *msg;
1047 }
1048
1049 #define bottom_xmlstring
1050 #include "elfgcchack.h"
1051