1 /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
2
3 /* libcroco - Library for parsing and applying CSS
4 * Copyright (C) 2006-2019 Free Software Foundation, Inc.
5 *
6 * This file is not part of the GNU gettext program, but is used with
7 * GNU gettext.
8 *
9 * The original copyright notice is as follows:
10 */
11
12 /*
13 * This file is part of The Croco Library
14 *
15 * Copyright (C) 2003-2004 Dodji Seketeli. All Rights Reserved.
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of version 2.1 of the GNU Lesser General Public
19 * License as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
25 *
26 * You should have received a copy of the GNU Lesser General Public License
27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
29 * USA
30 *
31 * Author: Dodji Seketeli
32 */
33
34 #include <config.h>
35 #include "cr-utils.h"
36 #include "cr-string.h"
37
/**
 *@file:
 *Some miscellaneous utility functions used
 *in libcroco.
 *Note that throughout this file I will
 *refer to the CSS SPECIFICATIONS DOCUMENTATION
 *written by the w3c guys. You can find that document
 *at http://www.w3.org/TR/REC-CSS2/ .
 */
47
48 /****************************
49 *Encoding transformations and
50 *encoding helpers
51 ****************************/
52
/*
 *Here is the correspondence between the ucs-4 character codes
 *and their matching utf-8 encoding pattern as described by RFC 2279:
 *
 *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 *------------------    -----------------------------
 *0000 0000-0000 007F   0xxxxxxx
 *0000 0080-0000 07FF   110xxxxx 10xxxxxx
 *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
 */
66
67 /**
68 *Given an utf8 string buffer, calculates
69 *the length of this string if it was encoded
70 *in ucs4.
71 *@param a_in_start a pointer to the begining of
72 *the input utf8 string.
73 *@param a_in_end a pointre to the end of the input
74 *utf8 string (points to the last byte of the buffer)
75 *@param a_len out parameter the calculated length.
76 *@return CR_OK upon succesfull completion, an error code
77 *otherwise.
78 */
79 enum CRStatus
cr_utils_utf8_str_len_as_ucs4(const guchar * a_in_start,const guchar * a_in_end,gulong * a_len)80 cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
81 const guchar * a_in_end, gulong * a_len)
82 {
83 guchar *byte_ptr = NULL;
84 gint len = 0;
85
86 /*
87 *to store the final decoded
88 *unicode char
89 */
90 guint c = 0;
91
92 g_return_val_if_fail (a_in_start && a_in_end && a_len,
93 CR_BAD_PARAM_ERROR);
94 *a_len = 0;
95
96 for (byte_ptr = (guchar *) a_in_start;
97 byte_ptr <= a_in_end; byte_ptr++) {
98 gint nb_bytes_2_decode = 0;
99
100 if (*byte_ptr <= 0x7F) {
101 /*
102 *7 bits long char
103 *encoded over 1 byte:
104 * 0xxx xxxx
105 */
106 c = *byte_ptr;
107 nb_bytes_2_decode = 1;
108
109 } else if ((*byte_ptr & 0xE0) == 0xC0) {
110 /*
111 *up to 11 bits long char.
112 *encoded over 2 bytes:
113 *110x xxxx 10xx xxxx
114 */
115 c = *byte_ptr & 0x1F;
116 nb_bytes_2_decode = 2;
117
118 } else if ((*byte_ptr & 0xF0) == 0xE0) {
119 /*
120 *up to 16 bit long char
121 *encoded over 3 bytes:
122 *1110 xxxx 10xx xxxx 10xx xxxx
123 */
124 c = *byte_ptr & 0x0F;
125 nb_bytes_2_decode = 3;
126
127 } else if ((*byte_ptr & 0xF8) == 0xF0) {
128 /*
129 *up to 21 bits long char
130 *encoded over 4 bytes:
131 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
132 */
133 c = *byte_ptr & 0x7;
134 nb_bytes_2_decode = 4;
135
136 } else if ((*byte_ptr & 0xFC) == 0xF8) {
137 /*
138 *up to 26 bits long char
139 *encoded over 5 bytes.
140 *1111 10xx 10xx xxxx 10xx xxxx
141 *10xx xxxx 10xx xxxx
142 */
143 c = *byte_ptr & 3;
144 nb_bytes_2_decode = 5;
145
146 } else if ((*byte_ptr & 0xFE) == 0xFC) {
147 /*
148 *up to 31 bits long char
149 *encoded over 6 bytes:
150 *1111 110x 10xx xxxx 10xx xxxx
151 *10xx xxxx 10xx xxxx 10xx xxxx
152 */
153 c = *byte_ptr & 1;
154 nb_bytes_2_decode = 6;
155
156 } else {
157 /*
158 *BAD ENCODING
159 */
160 return CR_ENCODING_ERROR;
161 }
162
163 /*
164 *Go and decode the remaining byte(s)
165 *(if any) to get the current character.
166 */
167 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
168 /*decode the next byte */
169 byte_ptr++;
170
171 /*byte pattern must be: 10xx xxxx */
172 if ((*byte_ptr & 0xC0) != 0x80) {
173 return CR_ENCODING_ERROR;
174 }
175
176 c = (c << 6) | (*byte_ptr & 0x3F);
177 }
178
179 len++;
180 }
181
182 *a_len = len;
183
184 return CR_OK;
185 }
186
187 /**
188 *Given an ucs4 string, this function
189 *returns the size (in bytes) this string
190 *would have occupied if it was encoded in utf-8.
191 *@param a_in_start a pointer to the beginning of the input
192 *buffer.
193 *@param a_in_end a pointer to the end of the input buffer.
194 *@param a_len out parameter. The computed length.
195 *@return CR_OK upon successfull completion, an error code otherwise.
196 */
197 enum CRStatus
cr_utils_ucs4_str_len_as_utf8(const guint32 * a_in_start,const guint32 * a_in_end,gulong * a_len)198 cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
199 const guint32 * a_in_end, gulong * a_len)
200 {
201 gint len = 0;
202 guint32 *char_ptr = NULL;
203
204 g_return_val_if_fail (a_in_start && a_in_end && a_len,
205 CR_BAD_PARAM_ERROR);
206
207 for (char_ptr = (guint32 *) a_in_start;
208 char_ptr <= a_in_end; char_ptr++) {
209 if (*char_ptr <= 0x7F) {
210 /*the utf-8 char would take 1 byte */
211 len += 1;
212 } else if (*char_ptr <= 0x7FF) {
213 /*the utf-8 char would take 2 bytes */
214 len += 2;
215 } else if (*char_ptr <= 0xFFFF) {
216 len += 3;
217 } else if (*char_ptr <= 0x1FFFFF) {
218 len += 4;
219 } else if (*char_ptr <= 0x3FFFFFF) {
220 len += 5;
221 } else if (*char_ptr <= 0x7FFFFFFF) {
222 len += 6;
223 }
224 }
225
226 *a_len = len;
227 return CR_OK;
228 }
229
230 /**
231 *Given an ucsA string, this function
232 *returns the size (in bytes) this string
233 *would have occupied if it was encoded in utf-8.
234 *@param a_in_start a pointer to the beginning of the input
235 *buffer.
236 *@param a_in_end a pointer to the end of the input buffer.
237 *@param a_len out parameter. The computed length.
238 *@return CR_OK upon successfull completion, an error code otherwise.
239 */
240 enum CRStatus
cr_utils_ucs1_str_len_as_utf8(const guchar * a_in_start,const guchar * a_in_end,gulong * a_len)241 cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
242 const guchar * a_in_end, gulong * a_len)
243 {
244 gint len = 0;
245 guchar *char_ptr = NULL;
246
247 g_return_val_if_fail (a_in_start && a_in_end && a_len,
248 CR_BAD_PARAM_ERROR);
249
250 for (char_ptr = (guchar *) a_in_start;
251 char_ptr <= a_in_end; char_ptr++) {
252 if (*char_ptr <= 0x7F) {
253 /*the utf-8 char would take 1 byte */
254 len += 1;
255 } else {
256 /*the utf-8 char would take 2 bytes */
257 len += 2;
258 }
259 }
260
261 *a_len = len;
262 return CR_OK;
263 }
264
265 /**
266 *Converts an utf8 buffer into an ucs4 buffer.
267 *
268 *@param a_in the input utf8 buffer to convert.
269 *@param a_in_len in/out parameter. The size of the
270 *input buffer to convert. After return, this parameter contains
271 *the actual number of bytes consumed.
272 *@param a_out the output converted ucs4 buffer. Must be allocated by
273 *the caller.
274 *@param a_out_len in/out parameter. The size of the output buffer.
275 *If this size is actually smaller than the real needed size, the function
276 *just converts what it can and returns a success status. After return,
277 *this param points to the actual number of characters decoded.
278 *@return CR_OK upon successfull completion, an error code otherwise.
279 */
280 enum CRStatus
cr_utils_utf8_to_ucs4(const guchar * a_in,gulong * a_in_len,guint32 * a_out,gulong * a_out_len)281 cr_utils_utf8_to_ucs4 (const guchar * a_in,
282 gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
283 {
284 gulong in_len = 0,
285 out_len = 0,
286 in_index = 0,
287 out_index = 0;
288 enum CRStatus status = CR_OK;
289
290 /*
291 *to store the final decoded
292 *unicode char
293 */
294 guint c = 0;
295
296 g_return_val_if_fail (a_in && a_in_len
297 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
298
299 if (*a_in_len < 1) {
300 status = CR_OK;
301 goto end;
302 }
303
304 in_len = *a_in_len;
305 out_len = *a_out_len;
306
307 for (in_index = 0, out_index = 0;
308 (in_index < in_len) && (out_index < out_len);
309 in_index++, out_index++) {
310 gint nb_bytes_2_decode = 0;
311
312 if (a_in[in_index] <= 0x7F) {
313 /*
314 *7 bits long char
315 *encoded over 1 byte:
316 * 0xxx xxxx
317 */
318 c = a_in[in_index];
319 nb_bytes_2_decode = 1;
320
321 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
322 /*
323 *up to 11 bits long char.
324 *encoded over 2 bytes:
325 *110x xxxx 10xx xxxx
326 */
327 c = a_in[in_index] & 0x1F;
328 nb_bytes_2_decode = 2;
329
330 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
331 /*
332 *up to 16 bit long char
333 *encoded over 3 bytes:
334 *1110 xxxx 10xx xxxx 10xx xxxx
335 */
336 c = a_in[in_index] & 0x0F;
337 nb_bytes_2_decode = 3;
338
339 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
340 /*
341 *up to 21 bits long char
342 *encoded over 4 bytes:
343 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
344 */
345 c = a_in[in_index] & 0x7;
346 nb_bytes_2_decode = 4;
347
348 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
349 /*
350 *up to 26 bits long char
351 *encoded over 5 bytes.
352 *1111 10xx 10xx xxxx 10xx xxxx
353 *10xx xxxx 10xx xxxx
354 */
355 c = a_in[in_index] & 3;
356 nb_bytes_2_decode = 5;
357
358 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
359 /*
360 *up to 31 bits long char
361 *encoded over 6 bytes:
362 *1111 110x 10xx xxxx 10xx xxxx
363 *10xx xxxx 10xx xxxx 10xx xxxx
364 */
365 c = a_in[in_index] & 1;
366 nb_bytes_2_decode = 6;
367
368 } else {
369 /*BAD ENCODING */
370 goto end;
371 }
372
373 /*
374 *Go and decode the remaining byte(s)
375 *(if any) to get the current character.
376 */
377 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
378 /*decode the next byte */
379 in_index++;
380
381 /*byte pattern must be: 10xx xxxx */
382 if ((a_in[in_index] & 0xC0) != 0x80) {
383 goto end;
384 }
385
386 c = (c << 6) | (a_in[in_index] & 0x3F);
387 }
388
389 /*
390 *The decoded ucs4 char is now
391 *in c.
392 */
393
394 /************************
395 *Some security tests
396 ***********************/
397
398 /*be sure c is a char */
399 if (c == 0xFFFF || c == 0xFFFE)
400 goto end;
401
402 /*be sure c is inferior to the max ucs4 char value */
403 if (c > 0x10FFFF)
404 goto end;
405
406 /*
407 *c must be less than UTF16 "lower surrogate begin"
408 *or higher than UTF16 "High surrogate end"
409 */
410 if (c >= 0xD800 && c <= 0xDFFF)
411 goto end;
412
413 /*Avoid characters that equals zero */
414 if (c == 0)
415 goto end;
416
417 a_out[out_index] = c;
418 }
419
420 end:
421 *a_out_len = out_index + 1;
422 *a_in_len = in_index + 1;
423
424 return status;
425 }
426
427 /**
428 *Reads a character from an utf8 buffer.
429 *Actually decode the next character code (unicode character code)
430 *and returns it.
431 *@param a_in the starting address of the utf8 buffer.
432 *@param a_in_len the length of the utf8 buffer.
433 *@param a_out output parameter. The resulting read char.
434 *@param a_consumed the number of the bytes consumed to
435 *decode the returned character code.
436 *@return CR_OK upon successfull completion, an error code otherwise.
437 */
438 enum CRStatus
cr_utils_read_char_from_utf8_buf(const guchar * a_in,gulong a_in_len,guint32 * a_out,gulong * a_consumed)439 cr_utils_read_char_from_utf8_buf (const guchar * a_in,
440 gulong a_in_len,
441 guint32 * a_out, gulong * a_consumed)
442 {
443 gulong in_index = 0,
444 nb_bytes_2_decode = 0;
445 enum CRStatus status = CR_OK;
446
447 /*
448 *to store the final decoded
449 *unicode char
450 */
451 guint32 c = 0;
452
453 g_return_val_if_fail (a_in && a_out && a_out
454 && a_consumed, CR_BAD_PARAM_ERROR);
455
456 if (a_in_len < 1) {
457 status = CR_OK;
458 goto end;
459 }
460
461 if (*a_in <= 0x7F) {
462 /*
463 *7 bits long char
464 *encoded over 1 byte:
465 * 0xxx xxxx
466 */
467 c = *a_in;
468 nb_bytes_2_decode = 1;
469
470 } else if ((*a_in & 0xE0) == 0xC0) {
471 /*
472 *up to 11 bits long char.
473 *encoded over 2 bytes:
474 *110x xxxx 10xx xxxx
475 */
476 c = *a_in & 0x1F;
477 nb_bytes_2_decode = 2;
478
479 } else if ((*a_in & 0xF0) == 0xE0) {
480 /*
481 *up to 16 bit long char
482 *encoded over 3 bytes:
483 *1110 xxxx 10xx xxxx 10xx xxxx
484 */
485 c = *a_in & 0x0F;
486 nb_bytes_2_decode = 3;
487
488 } else if ((*a_in & 0xF8) == 0xF0) {
489 /*
490 *up to 21 bits long char
491 *encoded over 4 bytes:
492 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
493 */
494 c = *a_in & 0x7;
495 nb_bytes_2_decode = 4;
496
497 } else if ((*a_in & 0xFC) == 0xF8) {
498 /*
499 *up to 26 bits long char
500 *encoded over 5 bytes.
501 *1111 10xx 10xx xxxx 10xx xxxx
502 *10xx xxxx 10xx xxxx
503 */
504 c = *a_in & 3;
505 nb_bytes_2_decode = 5;
506
507 } else if ((*a_in & 0xFE) == 0xFC) {
508 /*
509 *up to 31 bits long char
510 *encoded over 6 bytes:
511 *1111 110x 10xx xxxx 10xx xxxx
512 *10xx xxxx 10xx xxxx 10xx xxxx
513 */
514 c = *a_in & 1;
515 nb_bytes_2_decode = 6;
516
517 } else {
518 /*BAD ENCODING */
519 goto end;
520 }
521
522 if (nb_bytes_2_decode > a_in_len) {
523 status = CR_END_OF_INPUT_ERROR;
524 goto end;
525 }
526
527 /*
528 *Go and decode the remaining byte(s)
529 *(if any) to get the current character.
530 */
531 for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
532 /*byte pattern must be: 10xx xxxx */
533 if ((a_in[in_index] & 0xC0) != 0x80) {
534 goto end;
535 }
536
537 c = (c << 6) | (a_in[in_index] & 0x3F);
538 }
539
540 /*
541 *The decoded ucs4 char is now
542 *in c.
543 */
544
545 /************************
546 *Some security tests
547 ***********************/
548
549 /*be sure c is a char */
550 if (c == 0xFFFF || c == 0xFFFE)
551 goto end;
552
553 /*be sure c is inferior to the max ucs4 char value */
554 if (c > 0x10FFFF)
555 goto end;
556
557 /*
558 *c must be less than UTF16 "lower surrogate begin"
559 *or higher than UTF16 "High surrogate end"
560 */
561 if (c >= 0xD800 && c <= 0xDFFF)
562 goto end;
563
564 /*Avoid characters that equals zero */
565 if (c == 0)
566 goto end;
567
568 *a_out = c;
569
570 end:
571 *a_consumed = nb_bytes_2_decode;
572
573 return status;
574 }
575
576 /**
577 *
578 */
579 enum CRStatus
cr_utils_utf8_str_len_as_ucs1(const guchar * a_in_start,const guchar * a_in_end,gulong * a_len)580 cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
581 const guchar * a_in_end, gulong * a_len)
582 {
583 /*
584 *Note: this function can be made shorter
585 *but it considers all the cases of the utf8 encoding
586 *to ease further extensions ...
587 */
588
589 guchar *byte_ptr = NULL;
590 gint len = 0;
591
592 /*
593 *to store the final decoded
594 *unicode char
595 */
596 guint c = 0;
597
598 g_return_val_if_fail (a_in_start && a_in_end && a_len,
599 CR_BAD_PARAM_ERROR);
600 *a_len = 0;
601
602 for (byte_ptr = (guchar *) a_in_start;
603 byte_ptr <= a_in_end; byte_ptr++) {
604 gint nb_bytes_2_decode = 0;
605
606 if (*byte_ptr <= 0x7F) {
607 /*
608 *7 bits long char
609 *encoded over 1 byte:
610 * 0xxx xxxx
611 */
612 c = *byte_ptr;
613 nb_bytes_2_decode = 1;
614
615 } else if ((*byte_ptr & 0xE0) == 0xC0) {
616 /*
617 *up to 11 bits long char.
618 *encoded over 2 bytes:
619 *110x xxxx 10xx xxxx
620 */
621 c = *byte_ptr & 0x1F;
622 nb_bytes_2_decode = 2;
623
624 } else if ((*byte_ptr & 0xF0) == 0xE0) {
625 /*
626 *up to 16 bit long char
627 *encoded over 3 bytes:
628 *1110 xxxx 10xx xxxx 10xx xxxx
629 */
630 c = *byte_ptr & 0x0F;
631 nb_bytes_2_decode = 3;
632
633 } else if ((*byte_ptr & 0xF8) == 0xF0) {
634 /*
635 *up to 21 bits long char
636 *encoded over 4 bytes:
637 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
638 */
639 c = *byte_ptr & 0x7;
640 nb_bytes_2_decode = 4;
641
642 } else if ((*byte_ptr & 0xFC) == 0xF8) {
643 /*
644 *up to 26 bits long char
645 *encoded over 5 bytes.
646 *1111 10xx 10xx xxxx 10xx xxxx
647 *10xx xxxx 10xx xxxx
648 */
649 c = *byte_ptr & 3;
650 nb_bytes_2_decode = 5;
651
652 } else if ((*byte_ptr & 0xFE) == 0xFC) {
653 /*
654 *up to 31 bits long char
655 *encoded over 6 bytes:
656 *1111 110x 10xx xxxx 10xx xxxx
657 *10xx xxxx 10xx xxxx 10xx xxxx
658 */
659 c = *byte_ptr & 1;
660 nb_bytes_2_decode = 6;
661
662 } else {
663 /*
664 *BAD ENCODING
665 */
666 return CR_ENCODING_ERROR;
667 }
668
669 /*
670 *Go and decode the remaining byte(s)
671 *(if any) to get the current character.
672 */
673 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
674 /*decode the next byte */
675 byte_ptr++;
676
677 /*byte pattern must be: 10xx xxxx */
678 if ((*byte_ptr & 0xC0) != 0x80) {
679 return CR_ENCODING_ERROR;
680 }
681
682 c = (c << 6) | (*byte_ptr & 0x3F);
683 }
684
685 /*
686 *The decoded ucs4 char is now
687 *in c.
688 */
689
690 if (c <= 0xFF) { /*Add other conditions to support
691 *other char sets (ucs2, ucs3, ucs4).
692 */
693 len++;
694 } else {
695 /*the char is too long to fit
696 *into the supposed charset len.
697 */
698 return CR_ENCODING_ERROR;
699 }
700 }
701
702 *a_len = len;
703
704 return CR_OK;
705 }
706
707 /**
708 *Converts an utf8 string into an ucs4 string.
709 *@param a_in the input string to convert.
710 *@param a_in_len in/out parameter. The length of the input
711 *string. After return, points to the actual number of bytes
712 *consumed. This can be usefull to debug the input stream in case
713 *of encoding error.
714 *@param a_out out parameter. Points to the output string. It is allocated
715 *by this function and must be freed by the caller.
716 *@param a_out_len out parameter. The length of the output string.
717 *@return CR_OK upon successfull completion, an error code otherwise.
718 *
719 */
720 enum CRStatus
cr_utils_utf8_str_to_ucs4(const guchar * a_in,gulong * a_in_len,guint32 ** a_out,gulong * a_out_len)721 cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
722 gulong * a_in_len,
723 guint32 ** a_out, gulong * a_out_len)
724 {
725 enum CRStatus status = CR_OK;
726
727 g_return_val_if_fail (a_in && a_in_len
728 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
729
730 status = cr_utils_utf8_str_len_as_ucs4 (a_in,
731 &a_in[*a_in_len - 1],
732 a_out_len);
733
734 g_return_val_if_fail (status == CR_OK, status);
735
736 *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
737
738 status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
739
740 return status;
741 }
742
743 /**
744 *Converts an ucs4 buffer into an utf8 buffer.
745 *
746 *@param a_in the input ucs4 buffer to convert.
747 *@param a_in_len in/out parameter. The size of the
748 *input buffer to convert. After return, this parameter contains
749 *the actual number of characters consumed.
750 *@param a_out the output converted utf8 buffer. Must be allocated by
751 *the caller.
752 *@param a_out_len in/out parameter. The size of the output buffer.
753 *If this size is actually smaller than the real needed size, the function
754 *just converts what it can and returns a success status. After return,
755 *this param points to the actual number of bytes in the buffer.
756 *@return CR_OK upon successfull completion, an error code otherwise.
757 */
758 enum CRStatus
cr_utils_ucs4_to_utf8(const guint32 * a_in,gulong * a_in_len,guchar * a_out,gulong * a_out_len)759 cr_utils_ucs4_to_utf8 (const guint32 * a_in,
760 gulong * a_in_len, guchar * a_out, gulong * a_out_len)
761 {
762 gulong in_len = 0,
763 in_index = 0,
764 out_index = 0;
765 enum CRStatus status = CR_OK;
766
767 g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
768 CR_BAD_PARAM_ERROR);
769
770 if (*a_in_len < 1) {
771 status = CR_OK;
772 goto end;
773 }
774
775 in_len = *a_in_len;
776
777 for (in_index = 0; in_index < in_len; in_index++) {
778 /*
779 *FIXME: return whenever we encounter forbidden char values.
780 */
781
782 if (a_in[in_index] <= 0x7F) {
783 a_out[out_index] = a_in[in_index];
784 out_index++;
785 } else if (a_in[in_index] <= 0x7FF) {
786 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
787 a_out[out_index + 1] =
788 (0x80 | (a_in[in_index] & 0x3F));
789 out_index += 2;
790 } else if (a_in[in_index] <= 0xFFFF) {
791 a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
792 a_out[out_index + 1] =
793 (0x80 | ((a_in[in_index] >> 6) & 0x3F));
794 a_out[out_index + 2] =
795 (0x80 | (a_in[in_index] & 0x3F));
796 out_index += 3;
797 } else if (a_in[in_index] <= 0x1FFFFF) {
798 a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
799 a_out[out_index + 1]
800 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
801 a_out[out_index + 2]
802 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
803 a_out[out_index + 3]
804 = (0x80 | (a_in[in_index] & 0x3F));
805 out_index += 4;
806 } else if (a_in[in_index] <= 0x3FFFFFF) {
807 a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
808 a_out[out_index + 1] =
809 (0x80 | (a_in[in_index] >> 18));
810 a_out[out_index + 2]
811 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
812 a_out[out_index + 3]
813 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
814 a_out[out_index + 4]
815 = (0x80 | (a_in[in_index] & 0x3F));
816 out_index += 5;
817 } else if (a_in[in_index] <= 0x7FFFFFFF) {
818 a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
819 a_out[out_index + 1] =
820 (0x80 | (a_in[in_index] >> 24));
821 a_out[out_index + 2]
822 = (0x80 | ((a_in[in_index] >> 18) & 0x3F));
823 a_out[out_index + 3]
824 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
825 a_out[out_index + 4]
826 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
827 a_out[out_index + 4]
828 = (0x80 | (a_in[in_index] & 0x3F));
829 out_index += 6;
830 } else {
831 status = CR_ENCODING_ERROR;
832 goto end;
833 }
834 } /*end for */
835
836 end:
837 *a_in_len = in_index + 1;
838 *a_out_len = out_index + 1;
839
840 return status;
841 }
842
843 /**
844 *Converts an ucs4 string into an utf8 string.
845 *@param a_in the input string to convert.
846 *@param a_in_len in/out parameter. The length of the input
847 *string. After return, points to the actual number of characters
848 *consumed. This can be usefull to debug the input string in case
849 *of encoding error.
850 *@param a_out out parameter. Points to the output string. It is allocated
851 *by this function and must be freed by the caller.
852 *@param a_out_len out parameter. The length (in bytes) of the output string.
853 *@return CR_OK upon successfull completion, an error code otherwise.
854 */
855 enum CRStatus
cr_utils_ucs4_str_to_utf8(const guint32 * a_in,gulong * a_in_len,guchar ** a_out,gulong * a_out_len)856 cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
857 gulong * a_in_len,
858 guchar ** a_out, gulong * a_out_len)
859 {
860 enum CRStatus status = CR_OK;
861
862 g_return_val_if_fail (a_in && a_in_len && a_out
863 && a_out_len, CR_BAD_PARAM_ERROR);
864
865 status = cr_utils_ucs4_str_len_as_utf8 (a_in,
866 &a_in[*a_out_len - 1],
867 a_out_len);
868
869 g_return_val_if_fail (status == CR_OK, status);
870
871 status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
872
873 return status;
874 }
875
876 /**
877 *Converts an ucs1 buffer into an utf8 buffer.
878 *The caller must know the size of the resulting buffer and
879 *allocate it prior to calling this function.
880 *
881 *@param a_in the input ucs1 buffer.
882 *
883 *@param a_in_len in/out parameter. The length of the input buffer.
884 *After return, points to the number of bytes actually consumed even
885 *in case of encoding error.
886 *
887 *@param a_out out parameter. The output utf8 converted buffer.
888 *
889 *@param a_out_len in/out parameter. The size of the output buffer.
890 *If the output buffer size is shorter than the actual needed size,
891 *this function just convert what it can.
892 *
893 *@return CR_OK upon successfull completion, an error code otherwise.
894 *
895 */
896 enum CRStatus
cr_utils_ucs1_to_utf8(const guchar * a_in,gulong * a_in_len,guchar * a_out,gulong * a_out_len)897 cr_utils_ucs1_to_utf8 (const guchar * a_in,
898 gulong * a_in_len, guchar * a_out, gulong * a_out_len)
899 {
900 gulong out_index = 0,
901 in_index = 0,
902 in_len = 0,
903 out_len = 0;
904 enum CRStatus status = CR_OK;
905
906 g_return_val_if_fail (a_in && a_in_len
907 && a_out_len,
908 CR_BAD_PARAM_ERROR);
909
910 if (*a_in_len == 0) {
911 *a_out_len = 0 ;
912 return status;
913 }
914 g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;
915
916 in_len = *a_in_len;
917 out_len = *a_out_len;
918
919 for (in_index = 0, out_index = 0;
920 (in_index < in_len) && (out_index < out_len); in_index++) {
921 /*
922 *FIXME: return whenever we encounter forbidden char values.
923 */
924
925 if (a_in[in_index] <= 0x7F) {
926 a_out[out_index] = a_in[in_index];
927 out_index++;
928 } else {
929 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
930 a_out[out_index + 1] =
931 (0x80 | (a_in[in_index] & 0x3F));
932 out_index += 2;
933 }
934 } /*end for */
935
936 *a_in_len = in_index;
937 *a_out_len = out_index;
938
939 return status;
940 }
941
942 /**
943 *Converts an ucs1 string into an utf8 string.
944 *@param a_in_start the beginning of the input string to convert.
945 *@param a_in_end the end of the input string to convert.
946 *@param a_out out parameter. The converted string.
947 *@param a_out out parameter. The length of the converted string.
948 *@return CR_OK upon successfull completion, an error code otherwise.
949 *
950 */
951 enum CRStatus
cr_utils_ucs1_str_to_utf8(const guchar * a_in,gulong * a_in_len,guchar ** a_out,gulong * a_out_len)952 cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
953 gulong * a_in_len,
954 guchar ** a_out, gulong * a_out_len)
955 {
956 gulong out_len = 0;
957 enum CRStatus status = CR_OK;
958
959 g_return_val_if_fail (a_in && a_in_len && a_out
960 && a_out_len, CR_BAD_PARAM_ERROR);
961
962 if (*a_in_len < 1) {
963 *a_out_len = 0;
964 *a_out = NULL;
965 return CR_OK;
966 }
967
968 status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
969 &out_len);
970
971 g_return_val_if_fail (status == CR_OK, status);
972
973 *a_out = g_malloc0 (out_len);
974
975 status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);
976
977 *a_out_len = out_len;
978
979 return status;
980 }
981
982 /**
983 *Converts an utf8 buffer into an ucs1 buffer.
984 *The caller must know the size of the resulting
985 *converted buffer, and allocated it prior to calling this
986 *function.
987 *
988 *@param a_in the input utf8 buffer to convert.
989 *
990 *@param a_in_len in/out parameter. The size of the input utf8 buffer.
991 *After return, points to the number of bytes consumed
992 *by the function even in case of encoding error.
993 *
994 *@param a_out out parameter. Points to the resulting buffer.
995 *Must be allocated by the caller. If the size of a_out is shorter
996 *than its required size, this function converts what it can and return
997 *a successfull status.
998 *
999 *@param a_out_len in/out parameter. The size of the output buffer.
1000 *After return, points to the number of bytes consumed even in case of
1001 *encoding error.
1002 *
1003 *@return CR_OK upon successfull completion, an error code otherwise.
1004 */
1005 enum CRStatus
cr_utils_utf8_to_ucs1(const guchar * a_in,gulong * a_in_len,guchar * a_out,gulong * a_out_len)1006 cr_utils_utf8_to_ucs1 (const guchar * a_in,
1007 gulong * a_in_len, guchar * a_out, gulong * a_out_len)
1008 {
1009 gulong in_index = 0,
1010 out_index = 0,
1011 in_len = 0,
1012 out_len = 0;
1013 enum CRStatus status = CR_OK;
1014
1015 /*
1016 *to store the final decoded
1017 *unicode char
1018 */
1019 guint32 c = 0;
1020
1021 g_return_val_if_fail (a_in && a_in_len
1022 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1023
1024 if (*a_in_len < 1) {
1025 goto end;
1026 }
1027
1028 in_len = *a_in_len;
1029 out_len = *a_out_len;
1030
1031 for (in_index = 0, out_index = 0;
1032 (in_index < in_len) && (out_index < out_len);
1033 in_index++, out_index++) {
1034 gint nb_bytes_2_decode = 0;
1035
1036 if (a_in[in_index] <= 0x7F) {
1037 /*
1038 *7 bits long char
1039 *encoded over 1 byte:
1040 * 0xxx xxxx
1041 */
1042 c = a_in[in_index];
1043 nb_bytes_2_decode = 1;
1044
1045 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
1046 /*
1047 *up to 11 bits long char.
1048 *encoded over 2 bytes:
1049 *110x xxxx 10xx xxxx
1050 */
1051 c = a_in[in_index] & 0x1F;
1052 nb_bytes_2_decode = 2;
1053
1054 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
1055 /*
1056 *up to 16 bit long char
1057 *encoded over 3 bytes:
1058 *1110 xxxx 10xx xxxx 10xx xxxx
1059 */
1060 c = a_in[in_index] & 0x0F;
1061 nb_bytes_2_decode = 3;
1062
1063 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
1064 /*
1065 *up to 21 bits long char
1066 *encoded over 4 bytes:
1067 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
1068 */
1069 c = a_in[in_index] & 0x7;
1070 nb_bytes_2_decode = 4;
1071
1072 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
1073 /*
1074 *up to 26 bits long char
1075 *encoded over 5 bytes.
1076 *1111 10xx 10xx xxxx 10xx xxxx
1077 *10xx xxxx 10xx xxxx
1078 */
1079 c = a_in[in_index] & 3;
1080 nb_bytes_2_decode = 5;
1081
1082 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
1083 /*
1084 *up to 31 bits long char
1085 *encoded over 6 bytes:
1086 *1111 110x 10xx xxxx 10xx xxxx
1087 *10xx xxxx 10xx xxxx 10xx xxxx
1088 */
1089 c = a_in[in_index] & 1;
1090 nb_bytes_2_decode = 6;
1091
1092 } else {
1093 /*BAD ENCODING */
1094 status = CR_ENCODING_ERROR;
1095 goto end;
1096 }
1097
1098 /*
1099 *Go and decode the remaining byte(s)
1100 *(if any) to get the current character.
1101 */
1102 if (in_index + nb_bytes_2_decode - 1 >= in_len) {
1103 goto end;
1104 }
1105
1106 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
1107 /*decode the next byte */
1108 in_index++;
1109
1110 /*byte pattern must be: 10xx xxxx */
1111 if ((a_in[in_index] & 0xC0) != 0x80) {
1112 status = CR_ENCODING_ERROR;
1113 goto end;
1114 }
1115
1116 c = (c << 6) | (a_in[in_index] & 0x3F);
1117 }
1118
1119 /*
1120 *The decoded ucs4 char is now
1121 *in c.
1122 */
1123
1124 if (c > 0xFF) {
1125 status = CR_ENCODING_ERROR;
1126 goto end;
1127 }
1128
1129 a_out[out_index] = c;
1130 }
1131
1132 end:
1133 *a_out_len = out_index;
1134 *a_in_len = in_index;
1135
1136 return status;
1137 }
1138
1139 /**
1140 *Converts an utf8 buffer into an
1141 *ucs1 buffer.
1142 *@param a_in_start the start of the input buffer.
1143 *@param a_in_end the end of the input buffer.
1144 *@param a_out out parameter. The resulting converted ucs4 buffer.
1145 *Must be freed by the caller.
1146 *@param a_out_len out parameter. The length of the converted buffer.
1147 *@return CR_OK upon successfull completion, an error code otherwise.
1148 *Note that out parameters are valid if and only if this function
1149 *returns CR_OK.
1150 */
1151 enum CRStatus
cr_utils_utf8_str_to_ucs1(const guchar * a_in,gulong * a_in_len,guchar ** a_out,gulong * a_out_len)1152 cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
1153 gulong * a_in_len,
1154 guchar ** a_out, gulong * a_out_len)
1155 {
1156 enum CRStatus status = CR_OK;
1157
1158 g_return_val_if_fail (a_in && a_in_len
1159 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1160
1161 if (*a_in_len < 1) {
1162 *a_out_len = 0;
1163 *a_out = NULL;
1164 return CR_OK;
1165 }
1166
1167 status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
1168 a_out_len);
1169
1170 g_return_val_if_fail (status == CR_OK, status);
1171
1172 *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
1173
1174 status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
1175 return status;
1176 }
1177
1178 /*****************************************
1179 *CSS basic types identification utilities
1180 *****************************************/
1181
1182 /**
1183 *Returns TRUE if a_char is a white space as
1184 *defined in the css spec in chap 4.1.1.
1185 *
1186 *white-space ::= ' '| \t|\r|\n|\f
1187 *
1188 *@param a_char the character to test.
1189 *return TRUE if is a white space, false otherwise.
1190 */
1191 gboolean
cr_utils_is_white_space(guint32 a_char)1192 cr_utils_is_white_space (guint32 a_char)
1193 {
1194 switch (a_char) {
1195 case ' ':
1196 case '\t':
1197 case '\r':
1198 case '\n':
1199 case '\f':
1200 return TRUE;
1201 break;
1202 default:
1203 return FALSE;
1204 }
1205 }
1206
1207 /**
1208 *Returns true if the character is a newline
1209 *as defined in the css spec in the chap 4.1.1.
1210 *
1211 *nl ::= \n|\r\n|\r|\f
1212 *
1213 *@param a_char the character to test.
1214 *@return TRUE if the character is a newline, FALSE otherwise.
1215 */
1216 gboolean
cr_utils_is_newline(guint32 a_char)1217 cr_utils_is_newline (guint32 a_char)
1218 {
1219 switch (a_char) {
1220 case '\n':
1221 case '\r':
1222 case '\f':
1223 return TRUE;
1224 break;
1225 default:
1226 return FALSE;
1227 }
1228 }
1229
1230 /**
1231 *returns TRUE if the char is part of an hexa num char:
1232 *i.e hexa_char ::= [0-9A-F]
1233 */
1234 gboolean
cr_utils_is_hexa_char(guint32 a_char)1235 cr_utils_is_hexa_char (guint32 a_char)
1236 {
1237 if ((a_char >= '0' && a_char <= '9')
1238 || (a_char >= 'A' && a_char <= 'F')) {
1239 return TRUE;
1240 }
1241 return FALSE;
1242 }
1243
1244 /**
1245 *Returns true if the character is a nonascii
1246 *character (as defined in the css spec chap 4.1.1):
1247 *
1248 *nonascii ::= [^\0-\177]
1249 *
1250 *@param a_char the character to test.
1251 *@return TRUE if the character is a nonascii char,
1252 *FALSE otherwise.
1253 */
1254 gboolean
cr_utils_is_nonascii(guint32 a_char)1255 cr_utils_is_nonascii (guint32 a_char)
1256 {
1257 if (a_char <= 177) {
1258 return FALSE;
1259 }
1260
1261 return TRUE;
1262 }
1263
1264 /**
1265 *Dumps a character a_nb times on a file.
1266 *@param a_char the char to dump
1267 *@param a_fp the destination file pointer
1268 *@param a_nb the number of times a_char is to be dumped.
1269 */
1270 void
cr_utils_dump_n_chars(guchar a_char,FILE * a_fp,glong a_nb)1271 cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
1272 {
1273 glong i = 0;
1274
1275 for (i = 0; i < a_nb; i++) {
1276 fprintf (a_fp, "%c", a_char);
1277 }
1278 }
1279
1280 void
cr_utils_dump_n_chars2(guchar a_char,GString * a_string,glong a_nb)1281 cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
1282 {
1283 glong i = 0;
1284
1285 g_return_if_fail (a_string);
1286
1287 for (i = 0; i < a_nb; i++) {
1288 g_string_append_printf (a_string, "%c", a_char);
1289 }
1290 }
1291
1292 /**
1293 *Duplicates a list of GString instances.
1294 *@return the duplicated list of GString instances or NULL if
1295 *something bad happened.
1296 *@param a_list_of_strings the list of strings to be duplicated.
1297 */
1298 GList *
cr_utils_dup_glist_of_string(GList const * a_list_of_strings)1299 cr_utils_dup_glist_of_string (GList const * a_list_of_strings)
1300 {
1301 GList const *cur = NULL;
1302 GList *result = NULL;
1303
1304 g_return_val_if_fail (a_list_of_strings, NULL);
1305
1306 for (cur = a_list_of_strings; cur; cur = cur->next) {
1307 GString *str = NULL;
1308
1309 str = g_string_new_len (((GString *) cur->data)->str,
1310 ((GString *) cur->data)->len);
1311 if (str)
1312 result = g_list_append (result, str);
1313 }
1314
1315 return result;
1316 }
1317
1318 /**
1319 *Duplicate a GList where the GList::data is a CRString.
1320 *@param a_list_of_strings the list to duplicate
1321 *@return the duplicated list, or NULL if something bad
1322 *happened.
1323 */
1324 GList *
cr_utils_dup_glist_of_cr_string(GList const * a_list_of_strings)1325 cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings)
1326 {
1327 GList const *cur = NULL;
1328 GList *result = NULL;
1329
1330 g_return_val_if_fail (a_list_of_strings, NULL);
1331
1332 for (cur = a_list_of_strings; cur; cur = cur->next) {
1333 CRString *str = NULL;
1334
1335 str = cr_string_dup ((CRString const *) cur->data) ;
1336 if (str)
1337 result = g_list_append (result, str);
1338 }
1339
1340 return result;
1341 }
1342