1 /*
2 * Copyright 2001-2004 Brandon Long
3 * All Rights Reserved.
4 *
5 * ClearSilver Templating System
6 *
7 * This code is made available under the terms of the ClearSilver License.
8 * http://www.clearsilver.net/license.hdf
9 *
10 */
11
12 #include "cs_config.h"
13
14 #include <stdlib.h>
15 #include <string.h>
16 #include <sys/types.h>
17 #include <regex.h>
18 #include <ctype.h>
19 #include "util/neo_misc.h"
20 #include "util/neo_err.h"
21 #include "util/neo_str.h"
22 #include "html.h"
23 #include "cgi.h"
24
/*
 * Heuristically decide whether a block of plain text depends on whitespace
 * or punctuation for its layout.
 *
 * src/slen: the text to inspect; exactly slen bytes are examined.
 *
 * Returns:
 *   0 - nothing special was detected
 *   1 - a tab was seen, or an ordinary character was preceded by more than
 *       two counted spaces on its line or by more than two newlines
 *   2 - more than three "ascii art" punctuation characters occurred with
 *       no ordinary character between them
 */
static int has_space_formatting(const char *src, int slen)
{
  int spaces = 0;
  int returns = 0;
  int ascii_art = 0;
  int x;

  for (x = 0; x < slen; x++)
  {
    const char c = src[x];

    if (c == '\t') return 1;
    if (c == ' ')
    {
      /* A space directly after a period is sentence spacing, not layout */
      if (!(x && (src[x-1] == '.')))
        spaces++;
    }
    else if (c == '\n')
    {
      spaces = 0;
      returns++;
    }
    /* strchr() also matches the terminating NUL of the set, so a NUL byte
     * in the input must be excluded explicitly or it counts as ascii art */
    else if (c != '\0' && strchr ("/\\<>:[]!@#$%^&*()|", c))
    {
      ascii_art++;
      if (ascii_art > 3) return 2;
    }
    else if (c != '\r')
    {
      /* Ordinary character: judge the whitespace run that led up to it,
       * then start counting afresh. Carriage returns are ignored. */
      if (returns > 2) return 1;
      if (spaces > 2) return 1;
      returns = 0;
      spaces = 0;
      ascii_art = 0;
    }
  }

  return 0;
}
63
64 /*
65 static int has_long_lines (char *s, int l)
66 {
67 char *ptr;
68 int x = 0;
69
70 while (x < l)
71 {
72 ptr = strchr (s + x, '\n');
73 if (ptr == NULL)
74 {
75 if (l - x > 75) return 1;
76 return 0;
77 }
78 if (ptr - (s + x) > 75) return 1;
79 x = ptr - s + 1;
80 }
81 return 0;
82 }
83 */
84
/* The first step is to find all of the URLs and email addresses using
 * our regular expressions and record where each one begins and ends.
 * We then walk through the text, converting the non-special areas with
 * straight text->html escapes and converting each URL or email address
 * into the corresponding link markup.
 */
/* One URL or email address located in the source text. */
struct _parts {
  int begin;   /* offset of the first byte of the match */
  int end;     /* offset one past the last byte of the match */
  int type;    /* one of the SC_TYPE_* values below */
};

#define SC_TYPE_TEXT 1
#define SC_TYPE_URL 2
#define SC_TYPE_EMAIL 3

/* POSIX extended regular expressions used to locate email addresses and
 * URLs. They are read-only pattern text, so they are stored as const
 * arrays rather than as writable pointers to string literals. */
static const char EmailRe[] = "[^][@:;<>\\\"()[:space:][:cntrl:]]+@[-+a-zA-Z0-9]+\\.[-+a-zA-Z0-9\\.]+[-+a-zA-Z0-9]";
static const char URLRe[] = "((http|https|ftp|mailto):(//)?[^[:space:]>\"\t]*|www\\.[-a-z0-9\\.]+)[^[:space:];\t\">]*";
102
split_and_convert(const char * src,int slen,STRING * out,HTML_CONVERT_OPTS * opts)103 static NEOERR *split_and_convert (const char *src, int slen,
104 STRING *out, HTML_CONVERT_OPTS *opts)
105 {
106 NEOERR *err = STATUS_OK;
107 static int compiled = 0;
108 static regex_t email_re, url_re;
109 regmatch_t email_match, url_match;
110 int errcode;
111 char *ptr, *esc;
112 char errbuf[256];
113 struct _parts *parts;
114 int part_count;
115 int part;
116 int x, i;
117 int spaces = 0;
118
119 if (!compiled)
120 {
121 if ((errcode = regcomp (&email_re, EmailRe, REG_ICASE | REG_EXTENDED)))
122 {
123 regerror (errcode, &email_re, errbuf, sizeof(errbuf));
124 return nerr_raise (NERR_PARSE, "Unable to compile EmailRE: %s", errbuf);
125 }
126 if ((errcode = regcomp (&url_re, URLRe, REG_ICASE | REG_EXTENDED)))
127 {
128 regerror (errcode, &url_re, errbuf, sizeof(errbuf));
129 return nerr_raise (NERR_PARSE, "Unable to compile URLRe: %s", errbuf);
130 }
131 compiled = 1;
132 }
133
134 part_count = 20;
135 parts = (struct _parts *) malloc (sizeof(struct _parts) * part_count);
136 part = 0;
137
138 x = 0;
139 if (regexec (&email_re, src+x, 1, &email_match, 0) != 0)
140 {
141 email_match.rm_so = -1;
142 email_match.rm_eo = -1;
143 }
144 else
145 {
146 email_match.rm_so += x;
147 email_match.rm_eo += x;
148 }
149 if (regexec (&url_re, src+x, 1, &url_match, 0) != 0)
150 {
151 url_match.rm_so = -1;
152 url_match.rm_eo = -1;
153 }
154 else
155 {
156 url_match.rm_so += x;
157 url_match.rm_eo += x;
158 }
159 while ((x < slen) && !((email_match.rm_so == -1) && (url_match.rm_so == -1)))
160 {
161 if (part >= part_count)
162 {
163 part_count *= 2;
164 parts = (struct _parts *) realloc (parts, sizeof(struct _parts) * part_count);
165 }
166 if ((url_match.rm_so != -1) && ((email_match.rm_so == -1) || (url_match.rm_so <= email_match.rm_so)))
167 {
168 parts[part].begin = url_match.rm_so;
169 parts[part].end = url_match.rm_eo;
170 parts[part].type = SC_TYPE_URL;
171 x = parts[part].end + 1;
172 part++;
173 if (x < slen)
174 {
175 if (regexec (&url_re, src+x, 1, &url_match, 0) != 0)
176 {
177 url_match.rm_so = -1;
178 url_match.rm_eo = -1;
179 }
180 else
181 {
182 url_match.rm_so += x;
183 url_match.rm_eo += x;
184 }
185 if ((email_match.rm_so != -1) && (x > email_match.rm_so))
186 {
187 if (regexec (&email_re, src+x, 1, &email_match, 0) != 0)
188 {
189 email_match.rm_so = -1;
190 email_match.rm_eo = -1;
191 }
192 else
193 {
194 email_match.rm_so += x;
195 email_match.rm_eo += x;
196 }
197 }
198 }
199 }
200 else
201 {
202 parts[part].begin = email_match.rm_so;
203 parts[part].end = email_match.rm_eo;
204 parts[part].type = SC_TYPE_EMAIL;
205 x = parts[part].end + 1;
206 part++;
207 if (x < slen)
208 {
209 if (regexec (&email_re, src+x, 1, &email_match, 0) != 0)
210 {
211 email_match.rm_so = -1;
212 email_match.rm_eo = -1;
213 }
214 else
215 {
216 email_match.rm_so += x;
217 email_match.rm_eo += x;
218 }
219 if ((url_match.rm_so != -1) && (x > url_match.rm_so))
220 {
221 if (regexec (&url_re, src+x, 1, &url_match, 0) != 0)
222 {
223 url_match.rm_so = -1;
224 url_match.rm_eo = -1;
225 }
226 else
227 {
228 url_match.rm_so += x;
229 url_match.rm_eo += x;
230 }
231 }
232 }
233 }
234 }
235
236 i = 0;
237 x = 0;
238 while (x < slen)
239 {
240 if ((i >= part) || (x < parts[i].begin))
241 {
242 ptr = strpbrk(src + x, "&<>\r\n ");
243 if (ptr == NULL)
244 {
245 if (spaces)
246 {
247 int sp;
248 for (sp = 0; sp < spaces - 1; sp++)
249 {
250 err = string_append (out, " ");
251 if (err != STATUS_OK) break;
252 }
253 if (err != STATUS_OK) break;
254 err = string_append_char (out, ' ');
255 }
256 spaces = 0;
257 if (i < part)
258 {
259 err = string_appendn (out, src + x, parts[i].begin - x);
260 x = parts[i].begin;
261 }
262 else
263 {
264 err = string_append (out, src + x);
265 x = slen;
266 }
267 }
268 else
269 {
270 if ((i >= part) || ((ptr - src) < parts[i].begin))
271 {
272 if (spaces)
273 {
274 int sp;
275 for (sp = 0; sp < spaces - 1; sp++)
276 {
277 err = string_append (out, " ");
278 if (err != STATUS_OK) break;
279 }
280 if (err != STATUS_OK) break;
281 err = string_append_char (out, ' ');
282 }
283 spaces = 0;
284 err = string_appendn (out, src + x, (ptr - src) - x);
285 if (err != STATUS_OK) break;
286 x = ptr - src;
287 if (src[x] == ' ')
288 {
289 if (opts->space_convert)
290 {
291 spaces++;
292 }
293 else
294 err = string_append_char (out, ' ');
295 }
296 else
297 {
298 if (src[x] != '\n' && spaces)
299 {
300 int sp;
301 for (sp = 0; sp < spaces - 1; sp++)
302 {
303 err = string_append (out, " ");
304 if (err != STATUS_OK) break;
305 }
306 if (err != STATUS_OK) break;
307 err = string_append_char (out, ' ');
308 }
309 spaces = 0;
310
311 if (src[x] == '&')
312 err = string_append (out, "&");
313 else if (src[x] == '<')
314 err = string_append (out, "<");
315 else if (src[x] == '>')
316 err = string_append (out, ">");
317 else if (src[x] == '\n')
318 if (opts->newlines_convert)
319 err = string_append (out, "<br/>\n");
320 else if (x && src[x-1] == '\n')
321 err = string_append (out, "<p/>\n");
322 else
323 err = string_append_char (out, '\n');
324 else if (src[x] != '\r')
325 err = nerr_raise (NERR_ASSERT, "src[x] == '%c'", src[x]);
326 }
327 x++;
328 }
329 else
330 {
331 if (spaces)
332 {
333 int sp;
334 for (sp = 0; sp < spaces - 1; sp++)
335 {
336 err = string_append (out, " ");
337 if (err != STATUS_OK) break;
338 }
339 if (err != STATUS_OK) break;
340 err = string_append_char (out, ' ');
341 }
342 spaces = 0;
343 err = string_appendn (out, src + x, parts[i].begin - x);
344 x = parts[i].begin;
345 }
346 }
347 }
348 else
349 {
350 if (spaces)
351 {
352 int sp;
353 for (sp = 0; sp < spaces - 1; sp++)
354 {
355 err = string_append (out, " ");
356 if (err != STATUS_OK) break;
357 }
358 if (err != STATUS_OK) break;
359 err = string_append_char (out, ' ');
360 }
361 spaces = 0;
362 if (parts[i].type == SC_TYPE_URL)
363 {
364 char last_char = src[parts[i].end-1];
365 int suffix=0;
366 if (last_char == '.' || last_char == ',') { suffix=1; }
367 err = string_append (out, " <a ");
368 if (err != STATUS_OK) break;
369 if (opts->url_class)
370 {
371 err = string_appendf (out, "class=%s ", opts->url_class);
372 if (err) break;
373 }
374 if (opts->url_target)
375 {
376 err = string_appendf (out, "target=\"%s\" ", opts->url_target);
377 if (err) break;
378 }
379 err = string_append(out, "href=\"");
380 if (err) break;
381 if (opts->bounce_url)
382 {
383 char *url, *esc_url, *new_url;
384 int url_len;
385 if (!strncasecmp(src + x, "www.", 4))
386 {
387 url_len = 7 + parts[i].end - x - suffix;
388 url = (char *) malloc(url_len+1);
389 if (url == NULL)
390 {
391 err = nerr_raise(NERR_NOMEM,
392 "Unable to allocate memory to convert url");
393 break;
394 }
395 strcpy(url, "http://");
396 strncat(url, src + x, parts[i].end - x - suffix);
397 }
398 else
399 {
400 url_len = parts[i].end - x - suffix;
401 url = (char *) malloc(url_len+1);
402 if (url == NULL)
403 {
404 err = nerr_raise(NERR_NOMEM,
405 "Unable to allocate memory to convert url");
406 break;
407 }
408 strncpy(url, src + x, parts[i].end - x - suffix);
409 url[url_len] = '\0';
410 }
411 err = cgi_url_escape(url, &esc_url);
412 free(url);
413 if (err) {
414 free(esc_url);
415 break;
416 }
417
418 new_url = sprintf_alloc(opts->bounce_url, esc_url);
419 free(esc_url);
420 if (new_url == NULL)
421 {
422 err = nerr_raise(NERR_NOMEM, "Unable to allocate memory to convert url");
423 break;
424 }
425 err = string_append (out, new_url);
426 free(new_url);
427 if (err) break;
428 }
429 else
430 {
431 if (!strncasecmp(src + x, "www.", 4))
432 {
433 err = string_append (out, "http://");
434 if (err != STATUS_OK) break;
435 }
436 err = string_appendn (out, src + x, parts[i].end - x - suffix);
437 if (err != STATUS_OK) break;
438 }
439 err = string_append (out, "\">");
440 if (err != STATUS_OK) break;
441 if (opts->link_name) {
442 err = html_escape_alloc((opts->link_name),
443 strlen(opts->link_name), &esc);
444 } else {
445 err = html_escape_alloc((src + x), parts[i].end - x - suffix, &esc);
446 }
447 if (err != STATUS_OK) break;
448 err = string_append (out, esc);
449 free(esc);
450 if (err != STATUS_OK) break;
451 err = string_append (out, "</a>");
452 if (suffix) {
453 err = string_appendn(out,src + parts[i].end - 1,1);
454 if (err != STATUS_OK) break;
455 }
456 }
457 else /* type == SC_TYPE_EMAIL */
458 {
459 err = string_append (out, "<a ");
460 if (err != STATUS_OK) break;
461 if (opts->mailto_class)
462 {
463 err = string_appendf (out, "class=%s ", opts->mailto_class);
464 if (err) break;
465 }
466 err = string_append(out, "href=\"mailto:");
467 if (err) break;
468 err = string_appendn (out, src + x, parts[i].end - x);
469 if (err != STATUS_OK) break;
470 err = string_append (out, "\">");
471 if (err != STATUS_OK) break;
472 err = html_escape_alloc(src + x, parts[i].end - x, &esc);
473 if (err != STATUS_OK) break;
474 err = string_append (out, esc);
475 free(esc);
476 if (err != STATUS_OK) break;
477 err = string_append (out, "</a>");
478 }
479 x = parts[i].end;
480 i++;
481 }
482 if (err != STATUS_OK) break;
483 }
484 free (parts);
485 return err;
486 }
487
/*
 * Remove trailing whitespace from every line of str, in place.
 *
 * Whitespace (as defined by isspace) directly before a '\n' is removed;
 * the newline itself is kept. If the buffer does not end in a newline,
 * all whitespace at the very end of the buffer, newlines included, is
 * removed as well. str->len is updated and the buffer stays
 * NUL-terminated.
 *
 * The buffer is compacted in a single pass with separate read and write
 * positions, so a newline or a non-blank character at offset 0 is handled
 * like any other.
 */
static void strip_white_space_end (STRING *str)
{
  char *buf = str->buf;
  int total = str->len;
  int ends_with_newline;
  int rd;
  int wr = 0;

  if (buf == NULL || total <= 0) return;

  ends_with_newline = (buf[total - 1] == '\n');

  for (rd = 0; rd < total; rd++)
  {
    if (buf[rd] == '\n')
    {
      /* back the write position up over the blanks that ended this line,
       * never crossing the previous line's newline */
      while (wr > 0 && buf[wr - 1] != '\n' &&
             isspace ((unsigned char) buf[wr - 1]))
        wr--;
    }
    buf[wr++] = buf[rd];
  }

  if (!ends_with_newline)
  {
    /* just strip the white space at the end of the string */
    while (wr > 0 && isspace ((unsigned char) buf[wr - 1]))
      wr--;
  }

  buf[wr] = '\0';
  str->len = wr;
}
527
convert_text_html_alloc(const char * src,int slen,char ** out)528 NEOERR *convert_text_html_alloc (const char *src, int slen,
529 char **out)
530 {
531 return nerr_pass(convert_text_html_alloc_options(src, slen, out, NULL));
532 }
533
convert_text_html_alloc_options(const char * src,int slen,char ** out,HTML_CONVERT_OPTS * opts)534 NEOERR *convert_text_html_alloc_options (const char *src, int slen,
535 char **out,
536 HTML_CONVERT_OPTS *opts)
537 {
538 NEOERR *err;
539 STRING out_s;
540 int formatting = 0;
541 HTML_CONVERT_OPTS my_opts;
542
543 string_init(&out_s);
544
545 if (opts == NULL)
546 {
547 opts = &my_opts;
548 opts->bounce_url = NULL;
549 opts->url_class = NULL;
550 opts->url_target = "_blank";
551 opts->mailto_class = NULL;
552 opts->long_lines = 0;
553 opts->space_convert = 0;
554 opts->newlines_convert = 1;
555 opts->longline_width = 75; /* This hasn't been used in a while, actually */
556 opts->check_ascii_art = 1;
557 opts->link_name = NULL;
558 }
559
560 do
561 {
562 if (opts->check_ascii_art)
563 {
564 formatting = has_space_formatting (src, slen);
565 if (formatting) opts->space_convert = 1;
566 }
567 if (formatting == 2)
568 {
569 /* Do <pre> formatting */
570 opts->newlines_convert = 1;
571 err = string_append (&out_s, "<tt>");
572 if (err != STATUS_OK) break;
573 err = split_and_convert(src, slen, &out_s, opts);
574 if (err != STATUS_OK) break;
575 err = string_append (&out_s, "</tt>");
576 if (err != STATUS_OK) break;
577 /* Strip white space at end of lines */
578 strip_white_space_end (&out_s);
579 }
580 else
581 {
582 /* int nl = has_long_lines (src, slen); */
583 err = split_and_convert(src, slen, &out_s, opts);
584 }
585 } while (0);
586 if (err != STATUS_OK)
587 {
588 string_clear (&out_s);
589 return nerr_pass (err);
590 }
591 if (out_s.buf == NULL)
592 {
593 *out = strdup("");
594 }
595 else
596 {
597 *out = out_s.buf;
598 }
599 return STATUS_OK;
600 }
601
html_escape_alloc(const char * src,int slen,char ** out)602 NEOERR *html_escape_alloc (const char *src, int slen,
603 char **out)
604 {
605 return nerr_pass(neos_html_escape(src, slen, out));
606 }
607
/*
 * Map the body of an HTML character reference (the text between '&' and
 * ';') to a single ISO-8859-1 byte.
 *
 * s: NUL-terminated reference body, e.g. "amp", "eacute", "#233" or
 *    "#xe9". Named references are compared case-sensitively against
 *    lowercase names.
 *
 * Returns the byte, or 0 when the reference is empty, unknown, or a
 * numeric reference outside 0..255 (which has no single-byte ISO-8859-1
 * form and must not be truncated into an unrelated character).
 */
static unsigned char _expand_amp_8859_1_char (const char *s)
{
  static const struct {
    const char *name;
    unsigned char ch;
  } entities[] = {
    { "agrave", 0xe0 }, { "aacute", 0xe1 }, { "acirc",  0xe2 },
    { "atilde", 0xe3 }, { "auml",   0xe4 }, { "aring",  0xe5 },
    { "aelig",  0xe6 }, { "amp",    '&'  },
    { "ccedil", 0xe7 },
    { "egrave", 0xe8 }, { "eacute", 0xe9 }, { "ecirc",  0xea },
    { "euml",   0xeb }, { "eth",    0xf0 },
    { "igrave", 0xec }, { "iacute", 0xed }, { "icirc",  0xee },
    { "iuml",   0xef },
    { "gt",     '>'  },
    { "lt",     '<'  },
    { "ntilde", 0xf1 }, { "nbsp",   ' '  },
    { "ograve", 0xf2 }, { "oacute", 0xf3 }, { "ocirc",  0xf4 },
    { "otilde", 0xf5 }, { "ouml",   0xf6 }, { "oslash", 0xf8 },
    { "quot",   '"'  },
    { "szlig",  0xdf },
    { "thorn",  0xfe },
    { "ugrave", 0xf9 }, { "uacute", 0xfa }, { "ucirc",  0xfb },
    { "uuml",   0xfc },
    { "yacute", 0xfd }, { "yuml",   0xff }
  };
  size_t i;

  if (s[0] == '\0')
    return 0;

  if (s[0] == '#')
  {
    /* numeric reference: "#xHH" is hexadecimal, "#DDD" is decimal */
    long code;

    if (s[1] == 'x')
      code = strtol (s + 2, NULL, 16);
    else
      code = strtol (s + 1, NULL, 10);
    if (code < 0 || code > 0xff)
      return 0;
    return (unsigned char) code;
  }

  for (i = 0; i < sizeof(entities) / sizeof(entities[0]); i++)
  {
    if (!strcmp (s, entities[i].name))
      return entities[i].ch;
  }
  return 0;
}
683
/*
 * Expand the body of an HTML character reference to text.
 *
 * amp: the reference body (the text between '&' and ';').
 * buf: caller-supplied scratch space of at least two bytes.
 *
 * Returns buf holding the single expanded character when the reference
 * is recognised; otherwise the constant string "(C)" for "copy", or ""
 * for anything else. The constant strings must not be modified.
 */
char *html_expand_amp_8859_1(const char *amp,
                             char *buf)
{
  const unsigned char expanded = _expand_amp_8859_1_char(amp);

  if (expanded != '\0')
  {
    buf[0] = (char)expanded;
    buf[1] = '\0';
    return buf;
  }

  if (strcmp(amp, "copy") == 0)
    return "(C)";
  return "";
}
701
html_strip_alloc(const char * src,int slen,char ** out)702 NEOERR *html_strip_alloc(const char *src, int slen,
703 char **out)
704 {
705 NEOERR *err = STATUS_OK;
706 STRING out_s;
707 int x = 0;
708 int strip_match = -1;
709 int state = 0;
710 char amp[10];
711 int amp_start = 0;
712 char buf[10];
713 int ampl = 0;
714
715 string_init(&out_s);
716 err = string_append (&out_s, "");
717 if (err) return nerr_pass (err);
718
719 while (x < slen)
720 {
721 switch (state) {
722 case 0:
723 /* Default */
724 if (src[x] == '&')
725 {
726 state = 3;
727 ampl = 0;
728 amp_start = x;
729 }
730 else if (src[x] == '<')
731 {
732 state = 1;
733 }
734 else
735 {
736 if (strip_match == -1)
737 {
738 err = string_append_char(&out_s, src[x]);
739 if (err) break;
740 }
741 }
742 x++;
743 break;
744 case 1:
745 /* Starting TAG */
746 if (src[x] == '>')
747 {
748 state = 0;
749 }
750 else if (src[x] == '/')
751 {
752 }
753 else
754 {
755 }
756 x++;
757 break;
758 case 2:
759 /* In TAG */
760 if (src[x] == '>')
761 {
762 state = 0;
763 }
764 x++;
765 break;
766 case 3:
767 /* In AMP */
768 if (src[x] == ';')
769 {
770 amp[ampl] = '\0';
771 state = 0;
772 err = string_append(&out_s, html_expand_amp_8859_1(amp, buf));
773 if (err) break;
774 }
775 else
776 {
777 if (ampl < sizeof(amp)-1)
778 amp[ampl++] = tolower(src[x]);
779 else
780 {
781 /* broken html... just back up */
782 x = amp_start;
783 err = string_append_char(&out_s, src[x]);
784 if (err) break;
785 state = 0;
786 }
787 }
788 x++;
789 break;
790 }
791 if (err) break;
792 }
793
794
795 if (err)
796 {
797 string_clear (&out_s);
798 return nerr_pass (err);
799 }
800 *out = out_s.buf;
801 return STATUS_OK;
802 }
803