1 /* GStreamer SAMI subtitle parser
2 * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
13 *
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
17 * Boston, MA 02110-1301, USA.
18 */
19
20 #include "samiparse.h"
21
22 #include <glib.h>
23 #include <string.h>
24 #include <stdlib.h>
25
/* One-character flags stored in GstSamiContext::state, one for every tag
 * that is currently open. */
enum
{
  ITALIC_TAG = 'i',             /* <i> */
  SPAN_TAG = 's',               /* <font>, written out as <span> */
  RUBY_TAG = 'r',               /* <ruby> */
  RT_TAG = 't',                 /* <rt> */
  CLEAR_TAG = '0'               /* never pushed; asks pop to close everything */
};
31
32 typedef struct _HtmlParser HtmlParser;
33 typedef struct _HtmlContext HtmlContext;
34 typedef struct _GstSamiContext GstSamiContext;
35
36 struct _GstSamiContext
37 {
38 GString *buf; /* buffer to collect content */
39 GString *rubybuf; /* buffer to collect ruby content */
40 GString *resultbuf; /* when opening the next 'sync' tag, move
41 * from 'buf' to avoid to append following
42 * content */
43 GString *state; /* in many sami files there are tags that
44 * are not closed, so for each open tag the
45 * parser will append a tag flag here so
46 * that tags can be closed properly on
47 * 'sync' tags. See _context_push_state()
48 * and _context_pop_state(). */
49 HtmlContext *htmlctxt; /* html parser context */
50 gboolean has_result; /* set when ready to push out result */
51 gboolean in_sync; /* flag to avoid appending anything except the
52 * content of the sync elements to buf */
53 guint64 time1; /* previous start attribute in sync tag */
54 guint64 time2; /* current start attribute in sync tag */
55 };
56
57 struct _HtmlParser
58 {
59 void (*start_element) (HtmlContext * ctx,
60 const gchar * name, const gchar ** attr, gpointer user_data);
61 void (*end_element) (HtmlContext * ctx,
62 const gchar * name, gpointer user_data);
63 void (*text) (HtmlContext * ctx,
64 const gchar * text, gsize text_len, gpointer user_data);
65 };
66
67 struct _HtmlContext
68 {
69 const HtmlParser *parser;
70 gpointer user_data;
71 GString *buf;
72 };
73
74 static HtmlContext *
html_context_new(HtmlParser * parser,gpointer user_data)75 html_context_new (HtmlParser * parser, gpointer user_data)
76 {
77 HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1);
78 ctxt->parser = parser;
79 ctxt->user_data = user_data;
80 ctxt->buf = g_string_new (NULL);
81 return ctxt;
82 }
83
84 static void
html_context_free(HtmlContext * ctxt)85 html_context_free (HtmlContext * ctxt)
86 {
87 g_string_free (ctxt->buf, TRUE);
88 g_free (ctxt);
89 }
90
91 struct EntityMap
92 {
93 const gunichar unescaped;
94 const gchar *escaped;
95 };
96
97 struct EntityMap XmlEntities[] = {
98 {34, "quot;"},
99 {38, "amp;"},
100 {39, "apos;"},
101 {60, "lt;"},
102 {62, "gt;"},
103 {0, NULL},
104 };
105
106 struct EntityMap HtmlEntities[] = {
107 /* nbsp will handle manually
108 { 160, "nbsp;" }, */
109 {161, "iexcl;"},
110 {162, "cent;"},
111 {163, "pound;"},
112 {164, "curren;"},
113 {165, "yen;"},
114 {166, "brvbar;"},
115 {167, "sect;"},
116 {168, "uml;"},
117 {169, "copy;"},
118 {170, "ordf;"},
119 {171, "laquo;"},
120 {172, "not;"},
121 {173, "shy;"},
122 {174, "reg;"},
123 {175, "macr;"},
124 {176, "deg;"},
125 {177, "plusmn;"},
126 {178, "sup2;"},
127 {179, "sup3;"},
128 {180, "acute;"},
129 {181, "micro;"},
130 {182, "para;"},
131 {183, "middot;"},
132 {184, "cedil;"},
133 {185, "sup1;"},
134 {186, "ordm;"},
135 {187, "raquo;"},
136 {188, "frac14;"},
137 {189, "frac12;"},
138 {190, "frac34;"},
139 {191, "iquest;"},
140 {192, "Agrave;"},
141 {193, "Aacute;"},
142 {194, "Acirc;"},
143 {195, "Atilde;"},
144 {196, "Auml;"},
145 {197, "Aring;"},
146 {198, "AElig;"},
147 {199, "Ccedil;"},
148 {200, "Egrave;"},
149 {201, "Eacute;"},
150 {202, "Ecirc;"},
151 {203, "Euml;"},
152 {204, "Igrave;"},
153 {205, "Iacute;"},
154 {206, "Icirc;"},
155 {207, "Iuml;"},
156 {208, "ETH;"},
157 {209, "Ntilde;"},
158 {210, "Ograve;"},
159 {211, "Oacute;"},
160 {212, "Ocirc;"},
161 {213, "Otilde;"},
162 {214, "Ouml;"},
163 {215, "times;"},
164 {216, "Oslash;"},
165 {217, "Ugrave;"},
166 {218, "Uacute;"},
167 {219, "Ucirc;"},
168 {220, "Uuml;"},
169 {221, "Yacute;"},
170 {222, "THORN;"},
171 {223, "szlig;"},
172 {224, "agrave;"},
173 {225, "aacute;"},
174 {226, "acirc;"},
175 {227, "atilde;"},
176 {228, "auml;"},
177 {229, "aring;"},
178 {230, "aelig;"},
179 {231, "ccedil;"},
180 {232, "egrave;"},
181 {233, "eacute;"},
182 {234, "ecirc;"},
183 {235, "euml;"},
184 {236, "igrave;"},
185 {237, "iacute;"},
186 {238, "icirc;"},
187 {239, "iuml;"},
188 {240, "eth;"},
189 {241, "ntilde;"},
190 {242, "ograve;"},
191 {243, "oacute;"},
192 {244, "ocirc;"},
193 {245, "otilde;"},
194 {246, "ouml;"},
195 {247, "divide;"},
196 {248, "oslash;"},
197 {249, "ugrave;"},
198 {250, "uacute;"},
199 {251, "ucirc;"},
200 {252, "uuml;"},
201 {253, "yacute;"},
202 {254, "thorn;"},
203 {255, "yuml;"},
204 {338, "OElig;"},
205 {339, "oelig;"},
206 {352, "Scaron;"},
207 {353, "scaron;"},
208 {376, "Yuml;"},
209 {402, "fnof;"},
210 {710, "circ;"},
211 {732, "tilde;"},
212 {913, "Alpha;"},
213 {914, "Beta;"},
214 {915, "Gamma;"},
215 {916, "Delta;"},
216 {917, "Epsilon;"},
217 {918, "Zeta;"},
218 {919, "Eta;"},
219 {920, "Theta;"},
220 {921, "Iota;"},
221 {922, "Kappa;"},
222 {923, "Lambda;"},
223 {924, "Mu;"},
224 {925, "Nu;"},
225 {926, "Xi;"},
226 {927, "Omicron;"},
227 {928, "Pi;"},
228 {929, "Rho;"},
229 {931, "Sigma;"},
230 {932, "Tau;"},
231 {933, "Upsilon;"},
232 {934, "Phi;"},
233 {935, "Chi;"},
234 {936, "Psi;"},
235 {937, "Omega;"},
236 {945, "alpha;"},
237 {946, "beta;"},
238 {947, "gamma;"},
239 {948, "delta;"},
240 {949, "epsilon;"},
241 {950, "zeta;"},
242 {951, "eta;"},
243 {952, "theta;"},
244 {953, "iota;"},
245 {954, "kappa;"},
246 {955, "lambda;"},
247 {956, "mu;"},
248 {957, "nu;"},
249 {958, "xi;"},
250 {959, "omicron;"},
251 {960, "pi;"},
252 {961, "rho;"},
253 {962, "sigmaf;"},
254 {963, "sigma;"},
255 {964, "tau;"},
256 {965, "upsilon;"},
257 {966, "phi;"},
258 {967, "chi;"},
259 {968, "psi;"},
260 {969, "omega;"},
261 {977, "thetasym;"},
262 {978, "upsih;"},
263 {982, "piv;"},
264 {8194, "ensp;"},
265 {8195, "emsp;"},
266 {8201, "thinsp;"},
267 {8204, "zwnj;"},
268 {8205, "zwj;"},
269 {8206, "lrm;"},
270 {8207, "rlm;"},
271 {8211, "ndash;"},
272 {8212, "mdash;"},
273 {8216, "lsquo;"},
274 {8217, "rsquo;"},
275 {8218, "sbquo;"},
276 {8220, "ldquo;"},
277 {8221, "rdquo;"},
278 {8222, "bdquo;"},
279 {8224, "dagger;"},
280 {8225, "Dagger;"},
281 {8226, "bull;"},
282 {8230, "hellip;"},
283 {8240, "permil;"},
284 {8242, "prime;"},
285 {8243, "Prime;"},
286 {8249, "lsaquo;"},
287 {8250, "rsaquo;"},
288 {8254, "oline;"},
289 {8260, "frasl;"},
290 {8364, "euro;"},
291 {8465, "image;"},
292 {8472, "weierp;"},
293 {8476, "real;"},
294 {8482, "trade;"},
295 {8501, "alefsym;"},
296 {8592, "larr;"},
297 {8593, "uarr;"},
298 {8594, "rarr;"},
299 {8595, "darr;"},
300 {8596, "harr;"},
301 {8629, "crarr;"},
302 {8656, "lArr;"},
303 {8657, "uArr;"},
304 {8658, "rArr;"},
305 {8659, "dArr;"},
306 {8660, "hArr;"},
307 {8704, "forall;"},
308 {8706, "part;"},
309 {8707, "exist;"},
310 {8709, "empty;"},
311 {8711, "nabla;"},
312 {8712, "isin;"},
313 {8713, "notin;"},
314 {8715, "ni;"},
315 {8719, "prod;"},
316 {8721, "sum;"},
317 {8722, "minus;"},
318 {8727, "lowast;"},
319 {8730, "radic;"},
320 {8733, "prop;"},
321 {8734, "infin;"},
322 {8736, "ang;"},
323 {8743, "and;"},
324 {8744, "or;"},
325 {8745, "cap;"},
326 {8746, "cup;"},
327 {8747, "int;"},
328 {8756, "there4;"},
329 {8764, "sim;"},
330 {8773, "cong;"},
331 {8776, "asymp;"},
332 {8800, "ne;"},
333 {8801, "equiv;"},
334 {8804, "le;"},
335 {8805, "ge;"},
336 {8834, "sub;"},
337 {8835, "sup;"},
338 {8836, "nsub;"},
339 {8838, "sube;"},
340 {8839, "supe;"},
341 {8853, "oplus;"},
342 {8855, "otimes;"},
343 {8869, "perp;"},
344 {8901, "sdot;"},
345 {8968, "lceil;"},
346 {8969, "rceil;"},
347 {8970, "lfloor;"},
348 {8971, "rfloor;"},
349 {9001, "lang;"},
350 {9002, "rang;"},
351 {9674, "loz;"},
352 {9824, "spades;"},
353 {9827, "clubs;"},
354 {9829, "hearts;"},
355 {9830, "diams;"},
356 {0, NULL},
357 };
358
359 static gchar *
unescape_string(const gchar * text)360 unescape_string (const gchar * text)
361 {
362 gint i;
363 GString *unescaped = g_string_new (NULL);
364
365 while (*text) {
366 if (*text == '&') {
367 text++;
368
369 /* unescape   and */
370 if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
371 unescaped = g_string_append_unichar (unescaped, 160);
372 text += 4;
373 if (*text == ';') {
374 text++;
375 }
376 goto next;
377 }
378
379 /* pass xml entities. these will be processed as pango markup */
380 for (i = 0; XmlEntities[i].escaped; i++) {
381 gssize len = strlen (XmlEntities[i].escaped);
382 if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
383 unescaped = g_string_append_c (unescaped, '&');
384 unescaped =
385 g_string_append_len (unescaped, XmlEntities[i].escaped, len);
386 text += len;
387 goto next;
388 }
389 }
390
391 /* convert html entities */
392 for (i = 0; HtmlEntities[i].escaped; i++) {
393 gssize len = strlen (HtmlEntities[i].escaped);
394 if (!strncmp (text, HtmlEntities[i].escaped, len)) {
395 unescaped =
396 g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
397 text += len;
398 goto next;
399 }
400 }
401
402 if (*text == '#') {
403 gboolean is_hex = FALSE;
404 gunichar l;
405 gchar *end = NULL;
406
407 text++;
408 if (*text == 'x') {
409 is_hex = TRUE;
410 text++;
411 }
412 errno = 0;
413 if (is_hex) {
414 l = strtoul (text, &end, 16);
415 } else {
416 l = strtoul (text, &end, 10);
417 }
418
419 if (text == end || errno != 0) {
420 /* error occured. pass it */
421 goto next;
422 }
423 unescaped = g_string_append_unichar (unescaped, l);
424 text = end;
425
426 if (*text == ';') {
427 text++;
428 }
429 goto next;
430 }
431
432 /* escape & */
433 unescaped = g_string_append (unescaped, "&");
434
435 next:
436 continue;
437
438 } else if (g_ascii_isspace (*text)) {
439 unescaped = g_string_append_c (unescaped, ' ');
440 /* strip whitespace */
441 do {
442 text++;
443 } while ((*text) && g_ascii_isspace (*text));
444 } else {
445 unescaped = g_string_append_c (unescaped, *text);
446 text++;
447 }
448 }
449
450 return g_string_free (unescaped, FALSE);
451 }
452
453 static const gchar *
string_token(const gchar * string,const gchar * delimiter,gchar ** first)454 string_token (const gchar * string, const gchar * delimiter, gchar ** first)
455 {
456 gchar *next = strstr (string, delimiter);
457 if (next) {
458 *first = g_strndup (string, next - string);
459 } else {
460 *first = g_strdup (string);
461 }
462 return next;
463 }
464
465 static void
html_context_handle_element(HtmlContext * ctxt,const gchar * string,gboolean must_close)466 html_context_handle_element (HtmlContext * ctxt,
467 const gchar * string, gboolean must_close)
468 {
469 gchar *name = NULL;
470 gint count = 0, i;
471 gchar **attrs;
472 const gchar *found, *next;
473
474 /* split element name and attributes */
475 next = string_token (string, " ", &name);
476
477 if (next) {
478 /* count attributes */
479 found = next + 1;
480 while (TRUE) {
481 found = strchr (found, '=');
482 if (!found)
483 break;
484 found++;
485 count++;
486 }
487 } else {
488 count = 0;
489 }
490
491 attrs = g_new0 (gchar *, (count + 1) * 2);
492
493 for (i = 0; i < count && next != NULL; i += 2) {
494 gchar *attr_name = NULL, *attr_value = NULL;
495 gsize length;
496 next = string_token (next + 1, "=", &attr_name);
497 next = string_token (next + 1, " ", &attr_value);
498
499 /* strip " or ' from attribute value */
500 if (attr_value[0] == '"' || attr_value[0] == '\'') {
501 gchar *tmp = g_strdup (attr_value + 1);
502 g_free (attr_value);
503 attr_value = tmp;
504 }
505
506 length = strlen (attr_value);
507 if (length > 0 && (attr_value[length - 1] == '"'
508 || attr_value[length - 1] == '\'')) {
509 attr_value[length - 1] = '\0';
510 }
511
512 attrs[i] = attr_name;
513 attrs[i + 1] = attr_value;
514 }
515
516 ctxt->parser->start_element (ctxt, name,
517 (const gchar **) attrs, ctxt->user_data);
518 if (must_close) {
519 ctxt->parser->end_element (ctxt, name, ctxt->user_data);
520 }
521 g_strfreev (attrs);
522 g_free (name);
523 }
524
525 static void
html_context_parse(HtmlContext * ctxt,gchar * text,gsize text_len)526 html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
527 {
528 const gchar *next = NULL;
529 ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
530 next = ctxt->buf->str;
531 while (TRUE) {
532 if (next[0] == '<') {
533 gchar *element = NULL;
534 /* find <blahblah> */
535 if (!strchr (next, '>')) {
536 /* no tag end point. buffer will be process in next time */
537 return;
538 }
539
540 next = string_token (next, ">", &element);
541 next++;
542 if (g_str_has_suffix (next, "/")) {
543 /* handle <blah/> */
544 element[strlen (element) - 1] = '\0';
545 html_context_handle_element (ctxt, element + 1, TRUE);
546 } else if (element[1] == '/') {
547 /* handle </blah> */
548 ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
549 } else {
550 /* handle <blah> */
551 html_context_handle_element (ctxt, element + 1, FALSE);
552 }
553 g_free (element);
554 } else if (strchr (next, '<')) {
555 gchar *text = NULL;
556 gsize length;
557 next = string_token (next, "<", &text);
558 text = g_strstrip (text);
559 length = strlen (text);
560 ctxt->parser->text (ctxt, text, length, ctxt->user_data);
561 g_free (text);
562
563 } else {
564 gchar *text = (gchar *) next;
565 gsize length;
566 text = g_strstrip (text);
567 length = strlen (text);
568 ctxt->parser->text (ctxt, text, length, ctxt->user_data);
569 ctxt->buf = g_string_assign (ctxt->buf, "");
570 return;
571 }
572 }
573
574 ctxt->buf = g_string_assign (ctxt->buf, next);
575 }
576
577 static gchar *
has_tag(GString * str,const gchar tag)578 has_tag (GString * str, const gchar tag)
579 {
580 return strrchr (str->str, tag);
581 }
582
583 static void
sami_context_push_state(GstSamiContext * sctx,char state)584 sami_context_push_state (GstSamiContext * sctx, char state)
585 {
586 GST_LOG ("state %c", state);
587 g_string_append_c (sctx->state, state);
588 }
589
590 static void
sami_context_pop_state(GstSamiContext * sctx,char state)591 sami_context_pop_state (GstSamiContext * sctx, char state)
592 {
593 GString *str = g_string_new ("");
594 GString *context_state = sctx->state;
595 int i;
596
597 GST_LOG ("state %c", state);
598 for (i = context_state->len - 1; i >= 0; i--) {
599 switch (context_state->str[i]) {
600 case ITALIC_TAG: /* <i> */
601 {
602 g_string_append (str, "</i>");
603 break;
604 }
605 case SPAN_TAG: /* <span foreground= > */
606 {
607 g_string_append (str, "</span>");
608 break;
609 }
610 case RUBY_TAG: /* <span size= > -- ruby */
611 {
612 break;
613 }
614 case RT_TAG: /* ruby */
615 {
616 /* FIXME: support for furigana/ruby once implemented in pango */
617 g_string_append (sctx->rubybuf, "</span>");
618 if (has_tag (context_state, ITALIC_TAG)) {
619 g_string_append (sctx->rubybuf, "</i>");
620 }
621
622 break;
623 }
624 default:
625 break;
626 }
627 if (context_state->str[i] == state) {
628 g_string_append (sctx->buf, str->str);
629 g_string_free (str, TRUE);
630 g_string_truncate (context_state, i);
631 return;
632 }
633 }
634 if (state == CLEAR_TAG) {
635 g_string_append (sctx->buf, str->str);
636 g_string_truncate (context_state, 0);
637 }
638 g_string_free (str, TRUE);
639 }
640
641 static void
handle_start_sync(GstSamiContext * sctx,const gchar ** atts)642 handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
643 {
644 int i;
645
646 sami_context_pop_state (sctx, CLEAR_TAG);
647 if (atts != NULL) {
648 for (i = 0; (atts[i] != NULL); i += 2) {
649 const gchar *key, *value;
650
651 key = atts[i];
652 value = atts[i + 1];
653
654 if (!value)
655 continue;
656 if (!g_ascii_strcasecmp ("start", key)) {
657 /* Only set a new start time if we don't have text pending */
658 if (sctx->resultbuf->len == 0)
659 sctx->time1 = sctx->time2;
660
661 sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
662 sctx->time2 = MAX (sctx->time2, sctx->time1);
663 g_string_append (sctx->resultbuf, sctx->buf->str);
664 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
665 g_string_truncate (sctx->buf, 0);
666 }
667 }
668 }
669 }
670
671 static void
handle_start_font(GstSamiContext * sctx,const gchar ** atts)672 handle_start_font (GstSamiContext * sctx, const gchar ** atts)
673 {
674 int i;
675
676 sami_context_pop_state (sctx, SPAN_TAG);
677 if (atts != NULL) {
678 g_string_append (sctx->buf, "<span");
679 for (i = 0; (atts[i] != NULL); i += 2) {
680 const gchar *key, *value;
681
682 key = atts[i];
683 value = atts[i + 1];
684
685 if (!value)
686 continue;
687 if (!g_ascii_strcasecmp ("color", key)) {
688 /*
689 * There are invalid color value in many
690 * sami files.
691 * It will fix hex color value that start without '#'
692 */
693 const gchar *sharp = "";
694 int len = strlen (value);
695
696 if (!(*value == '#' && len == 7)) {
697 gchar *r;
698
699 /* check if it looks like hex */
700 if (strtol ((const char *) value, &r, 16) >= 0 &&
701 ((gchar *) r == (value + 6) && len == 6)) {
702 sharp = "#";
703 }
704 }
705 /* some colours can be found in many sami files, but X RGB database
706 * doesn't contain a colour by this name, so map explicitly */
707 if (!g_ascii_strcasecmp ("aqua", value)) {
708 value = "#00ffff";
709 } else if (!g_ascii_strcasecmp ("crimson", value)) {
710 value = "#dc143c";
711 } else if (!g_ascii_strcasecmp ("fuchsia", value)) {
712 value = "#ff00ff";
713 } else if (!g_ascii_strcasecmp ("indigo", value)) {
714 value = "#4b0082";
715 } else if (!g_ascii_strcasecmp ("lime", value)) {
716 value = "#00ff00";
717 } else if (!g_ascii_strcasecmp ("olive", value)) {
718 value = "#808000";
719 } else if (!g_ascii_strcasecmp ("silver", value)) {
720 value = "#c0c0c0";
721 } else if (!g_ascii_strcasecmp ("teal", value)) {
722 value = "#008080";
723 }
724 g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
725 value);
726 } else if (!g_ascii_strcasecmp ("face", key)) {
727 g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
728 }
729 }
730 g_string_append_c (sctx->buf, '>');
731 sami_context_push_state (sctx, SPAN_TAG);
732 }
733 }
734
735 static void
handle_start_element(HtmlContext * ctx,const gchar * name,const char ** atts,gpointer user_data)736 handle_start_element (HtmlContext * ctx, const gchar * name,
737 const char **atts, gpointer user_data)
738 {
739 GstSamiContext *sctx = (GstSamiContext *) user_data;
740
741 GST_LOG ("name:%s", name);
742
743 if (!g_ascii_strcasecmp ("sync", name)) {
744 handle_start_sync (sctx, atts);
745 sctx->in_sync = TRUE;
746 } else if (!g_ascii_strcasecmp ("font", name)) {
747 handle_start_font (sctx, atts);
748 } else if (!g_ascii_strcasecmp ("ruby", name)) {
749 sami_context_push_state (sctx, RUBY_TAG);
750 } else if (!g_ascii_strcasecmp ("br", name)) {
751 g_string_append_c (sctx->buf, '\n');
752 /* FIXME: support for furigana/ruby once implemented in pango */
753 } else if (!g_ascii_strcasecmp ("rt", name)) {
754 if (has_tag (sctx->state, ITALIC_TAG)) {
755 g_string_append (sctx->rubybuf, "<i>");
756 }
757 g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
758 sami_context_push_state (sctx, RT_TAG);
759 } else if (!g_ascii_strcasecmp ("i", name)) {
760 g_string_append (sctx->buf, "<i>");
761 sami_context_push_state (sctx, ITALIC_TAG);
762 } else if (!g_ascii_strcasecmp ("p", name)) {
763 }
764 }
765
766 static void
handle_end_element(HtmlContext * ctx,const char * name,gpointer user_data)767 handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
768 {
769 GstSamiContext *sctx = (GstSamiContext *) user_data;
770
771 GST_LOG ("name:%s", name);
772
773 if (!g_ascii_strcasecmp ("sync", name)) {
774 sctx->in_sync = FALSE;
775 } else if ((!g_ascii_strcasecmp ("body", name)) ||
776 (!g_ascii_strcasecmp ("sami", name))) {
777 /* We will usually have one buffer left when the body is closed
778 * as we need the next sync to actually send it */
779 if (sctx->buf->len != 0) {
780 /* Only set a new start time if we don't have text pending */
781 if (sctx->resultbuf->len == 0)
782 sctx->time1 = sctx->time2;
783
784 sctx->time2 = GST_CLOCK_TIME_NONE;
785 g_string_append (sctx->resultbuf, sctx->buf->str);
786 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
787 g_string_truncate (sctx->buf, 0);
788 }
789 } else if (!g_ascii_strcasecmp ("font", name)) {
790 sami_context_pop_state (sctx, SPAN_TAG);
791 } else if (!g_ascii_strcasecmp ("ruby", name)) {
792 sami_context_pop_state (sctx, RUBY_TAG);
793 } else if (!g_ascii_strcasecmp ("i", name)) {
794 sami_context_pop_state (sctx, ITALIC_TAG);
795 }
796 }
797
798 static void
handle_text(HtmlContext * ctx,const gchar * text,gsize text_len,gpointer user_data)799 handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
800 gpointer user_data)
801 {
802 GstSamiContext *sctx = (GstSamiContext *) user_data;
803
804 /* Skip everything except content of the sync elements */
805 if (!sctx->in_sync)
806 return;
807
808 if (has_tag (sctx->state, RT_TAG)) {
809 g_string_append_c (sctx->rubybuf, ' ');
810 g_string_append (sctx->rubybuf, text);
811 g_string_append_c (sctx->rubybuf, ' ');
812 } else {
813 g_string_append (sctx->buf, text);
814 }
815 }
816
817 static HtmlParser samiParser = {
818 handle_start_element, /* start_element */
819 handle_end_element, /* end_element */
820 handle_text, /* text */
821 };
822
823 void
sami_context_init(ParserState * state)824 sami_context_init (ParserState * state)
825 {
826 GstSamiContext *context;
827
828 g_assert (state->user_data == NULL);
829
830 context = g_new0 (GstSamiContext, 1);
831
832 context->htmlctxt = html_context_new (&samiParser, context);
833 context->buf = g_string_new ("");
834 context->rubybuf = g_string_new ("");
835 context->resultbuf = g_string_new ("");
836 context->state = g_string_new ("");
837
838 state->user_data = context;
839 }
840
841 void
sami_context_deinit(ParserState * state)842 sami_context_deinit (ParserState * state)
843 {
844 GstSamiContext *context = (GstSamiContext *) state->user_data;
845
846 if (context) {
847 html_context_free (context->htmlctxt);
848 context->htmlctxt = NULL;
849 g_string_free (context->buf, TRUE);
850 g_string_free (context->rubybuf, TRUE);
851 g_string_free (context->resultbuf, TRUE);
852 g_string_free (context->state, TRUE);
853 g_free (context);
854 state->user_data = NULL;
855 }
856 }
857
858 void
sami_context_reset(ParserState * state)859 sami_context_reset (ParserState * state)
860 {
861 GstSamiContext *context = (GstSamiContext *) state->user_data;
862
863 if (context) {
864 g_string_truncate (context->buf, 0);
865 g_string_truncate (context->rubybuf, 0);
866 g_string_truncate (context->resultbuf, 0);
867 g_string_truncate (context->state, 0);
868 context->has_result = FALSE;
869 context->in_sync = FALSE;
870 context->time1 = 0;
871 context->time2 = 0;
872 }
873 }
874
875 gchar *
parse_sami(ParserState * state,const gchar * line)876 parse_sami (ParserState * state, const gchar * line)
877 {
878 gchar *ret = NULL;
879 GstSamiContext *context = (GstSamiContext *) state->user_data;
880
881 gchar *unescaped = unescape_string (line);
882 html_context_parse (context->htmlctxt, (gchar *) unescaped,
883 strlen (unescaped));
884 g_free (unescaped);
885
886 if (context->has_result) {
887 if (context->rubybuf->len) {
888 context->rubybuf = g_string_append_c (context->rubybuf, '\n');
889 g_string_prepend (context->resultbuf, context->rubybuf->str);
890 context->rubybuf = g_string_truncate (context->rubybuf, 0);
891 }
892
893 ret = g_string_free (context->resultbuf, FALSE);
894 context->resultbuf = g_string_new ("");
895 state->start_time = context->time1;
896 state->duration = context->time2 - context->time1;
897 context->has_result = FALSE;
898 }
899 return ret;
900 }
901