1 /* GStreamer SAMI subtitle parser
2 * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
13 *
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
17 * Boston, MA 02110-1301, USA.
18 */
19
#include "samiparse.h"

#include <glib.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
25
/* Single-character flags stored in GstSamiContext.state, one per tag that
 * is currently open (see sami_context_push_state() and
 * sami_context_pop_state()). */
#define ITALIC_TAG 'i'          /* pushed for <i> */
#define SPAN_TAG 's'            /* pushed for <font>, emitted as <span> */
#define RUBY_TAG 'r'            /* pushed for <ruby> */
#define RT_TAG 't'              /* pushed for <rt> */
#define CLEAR_TAG '0'           /* pseudo-flag handed to
                                 * sami_context_pop_state() to close every
                                 * open tag */
31
/* Forward declarations of the types defined below. */
typedef struct _HtmlParser HtmlParser;
typedef struct _HtmlContext HtmlContext;
typedef struct _GstSamiContext GstSamiContext;
35
/* Per-stream state of the SAMI parser; created by sami_context_init() and
 * stored in ParserState.user_data. */
struct _GstSamiContext
{
  GString *buf;                 /* buffer to collect content */
  GString *rubybuf;             /* buffer to collect ruby content */
  GString *resultbuf;           /* finished text: when the next 'sync' tag
                                 * opens (or body/sami is closed) the
                                 * content of 'buf' is moved here so that
                                 * following content is not appended to
                                 * it */
  GString *state;               /* in many sami files there are tags that
                                 * are not closed, so for each open tag the
                                 * parser will append a tag flag here so
                                 * that tags can be closed properly on
                                 * 'sync' tags. See
                                 * sami_context_push_state() and
                                 * sami_context_pop_state(). */
  HtmlContext *htmlctxt;        /* html parser context */
  gboolean has_result;          /* set when ready to push out result */
  gboolean in_sync;             /* flag to avoid appending anything except the
                                 * content of the sync elements to buf */
  guint64 time1;                /* previous start attribute in sync tag,
                                 * scaled by GST_MSECOND */
  guint64 time2;                /* current start attribute in sync tag,
                                 * scaled by GST_MSECOND */
};
56
/* Callback table used by the HTML tokenizer (html_context_parse()).
 * The member order matters: samiParser is initialized positionally. */
struct _HtmlParser
{
  /* called for every opening tag; attr is a NULL-terminated array of
   * alternating attribute names and values */
  void (*start_element) (HtmlContext * ctx,
      const gchar * name, const gchar ** attr, gpointer user_data);
  /* called for every closing tag, and right after start_element for
   * self-closing tags */
  void (*end_element) (HtmlContext * ctx,
      const gchar * name, gpointer user_data);
  /* called for the text found between tags */
  void (*text) (HtmlContext * ctx,
      const gchar * text, gsize text_len, gpointer user_data);
};
66
/* State of one HTML tokenizer instance. */
struct _HtmlContext
{
  const HtmlParser *parser;     /* callbacks invoked while tokenizing */
  gpointer user_data;           /* opaque pointer handed to every callback */
  GString *buf;                 /* input that has been received but not yet
                                 * consumed by html_context_parse() */
};
73
74 static HtmlContext *
html_context_new(HtmlParser * parser,gpointer user_data)75 html_context_new (HtmlParser * parser, gpointer user_data)
76 {
77 HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1);
78 ctxt->parser = parser;
79 ctxt->user_data = user_data;
80 ctxt->buf = g_string_new (NULL);
81 return ctxt;
82 }
83
84 static void
html_context_free(HtmlContext * ctxt)85 html_context_free (HtmlContext * ctxt)
86 {
87 g_string_free (ctxt->buf, TRUE);
88 g_free (ctxt);
89 }
90
/* One row of an entity lookup table (XmlEntities, HtmlEntities). */
struct EntityMap
{
  const gunichar unescaped;     /* code point the entity stands for */
  const gchar *escaped;         /* entity name without the leading '&' but
                                 * with the trailing ';'; NULL terminates
                                 * a table */
};
96
97 struct EntityMap XmlEntities[] = {
98 {34, "quot;"},
99 {38, "amp;"},
100 {39, "apos;"},
101 {60, "lt;"},
102 {62, "gt;"},
103 {0, NULL},
104 };
105
106 struct EntityMap HtmlEntities[] = {
107 /* nbsp will handle manually
108 { 160, "nbsp;" }, */
109 {161, "iexcl;"},
110 {162, "cent;"},
111 {163, "pound;"},
112 {164, "curren;"},
113 {165, "yen;"},
114 {166, "brvbar;"},
115 {167, "sect;"},
116 {168, "uml;"},
117 {169, "copy;"},
118 {170, "ordf;"},
119 {171, "laquo;"},
120 {172, "not;"},
121 {173, "shy;"},
122 {174, "reg;"},
123 {175, "macr;"},
124 {176, "deg;"},
125 {177, "plusmn;"},
126 {178, "sup2;"},
127 {179, "sup3;"},
128 {180, "acute;"},
129 {181, "micro;"},
130 {182, "para;"},
131 {183, "middot;"},
132 {184, "cedil;"},
133 {185, "sup1;"},
134 {186, "ordm;"},
135 {187, "raquo;"},
136 {188, "frac14;"},
137 {189, "frac12;"},
138 {190, "frac34;"},
139 {191, "iquest;"},
140 {192, "Agrave;"},
141 {193, "Aacute;"},
142 {194, "Acirc;"},
143 {195, "Atilde;"},
144 {196, "Auml;"},
145 {197, "Aring;"},
146 {198, "AElig;"},
147 {199, "Ccedil;"},
148 {200, "Egrave;"},
149 {201, "Eacute;"},
150 {202, "Ecirc;"},
151 {203, "Euml;"},
152 {204, "Igrave;"},
153 {205, "Iacute;"},
154 {206, "Icirc;"},
155 {207, "Iuml;"},
156 {208, "ETH;"},
157 {209, "Ntilde;"},
158 {210, "Ograve;"},
159 {211, "Oacute;"},
160 {212, "Ocirc;"},
161 {213, "Otilde;"},
162 {214, "Ouml;"},
163 {215, "times;"},
164 {216, "Oslash;"},
165 {217, "Ugrave;"},
166 {218, "Uacute;"},
167 {219, "Ucirc;"},
168 {220, "Uuml;"},
169 {221, "Yacute;"},
170 {222, "THORN;"},
171 {223, "szlig;"},
172 {224, "agrave;"},
173 {225, "aacute;"},
174 {226, "acirc;"},
175 {227, "atilde;"},
176 {228, "auml;"},
177 {229, "aring;"},
178 {230, "aelig;"},
179 {231, "ccedil;"},
180 {232, "egrave;"},
181 {233, "eacute;"},
182 {234, "ecirc;"},
183 {235, "euml;"},
184 {236, "igrave;"},
185 {237, "iacute;"},
186 {238, "icirc;"},
187 {239, "iuml;"},
188 {240, "eth;"},
189 {241, "ntilde;"},
190 {242, "ograve;"},
191 {243, "oacute;"},
192 {244, "ocirc;"},
193 {245, "otilde;"},
194 {246, "ouml;"},
195 {247, "divide;"},
196 {248, "oslash;"},
197 {249, "ugrave;"},
198 {250, "uacute;"},
199 {251, "ucirc;"},
200 {252, "uuml;"},
201 {253, "yacute;"},
202 {254, "thorn;"},
203 {255, "yuml;"},
204 {338, "OElig;"},
205 {339, "oelig;"},
206 {352, "Scaron;"},
207 {353, "scaron;"},
208 {376, "Yuml;"},
209 {402, "fnof;"},
210 {710, "circ;"},
211 {732, "tilde;"},
212 {913, "Alpha;"},
213 {914, "Beta;"},
214 {915, "Gamma;"},
215 {916, "Delta;"},
216 {917, "Epsilon;"},
217 {918, "Zeta;"},
218 {919, "Eta;"},
219 {920, "Theta;"},
220 {921, "Iota;"},
221 {922, "Kappa;"},
222 {923, "Lambda;"},
223 {924, "Mu;"},
224 {925, "Nu;"},
225 {926, "Xi;"},
226 {927, "Omicron;"},
227 {928, "Pi;"},
228 {929, "Rho;"},
229 {931, "Sigma;"},
230 {932, "Tau;"},
231 {933, "Upsilon;"},
232 {934, "Phi;"},
233 {935, "Chi;"},
234 {936, "Psi;"},
235 {937, "Omega;"},
236 {945, "alpha;"},
237 {946, "beta;"},
238 {947, "gamma;"},
239 {948, "delta;"},
240 {949, "epsilon;"},
241 {950, "zeta;"},
242 {951, "eta;"},
243 {952, "theta;"},
244 {953, "iota;"},
245 {954, "kappa;"},
246 {955, "lambda;"},
247 {956, "mu;"},
248 {957, "nu;"},
249 {958, "xi;"},
250 {959, "omicron;"},
251 {960, "pi;"},
252 {961, "rho;"},
253 {962, "sigmaf;"},
254 {963, "sigma;"},
255 {964, "tau;"},
256 {965, "upsilon;"},
257 {966, "phi;"},
258 {967, "chi;"},
259 {968, "psi;"},
260 {969, "omega;"},
261 {977, "thetasym;"},
262 {978, "upsih;"},
263 {982, "piv;"},
264 {8194, "ensp;"},
265 {8195, "emsp;"},
266 {8201, "thinsp;"},
267 {8204, "zwnj;"},
268 {8205, "zwj;"},
269 {8206, "lrm;"},
270 {8207, "rlm;"},
271 {8211, "ndash;"},
272 {8212, "mdash;"},
273 {8216, "lsquo;"},
274 {8217, "rsquo;"},
275 {8218, "sbquo;"},
276 {8220, "ldquo;"},
277 {8221, "rdquo;"},
278 {8222, "bdquo;"},
279 {8224, "dagger;"},
280 {8225, "Dagger;"},
281 {8226, "bull;"},
282 {8230, "hellip;"},
283 {8240, "permil;"},
284 {8242, "prime;"},
285 {8243, "Prime;"},
286 {8249, "lsaquo;"},
287 {8250, "rsaquo;"},
288 {8254, "oline;"},
289 {8260, "frasl;"},
290 {8364, "euro;"},
291 {8465, "image;"},
292 {8472, "weierp;"},
293 {8476, "real;"},
294 {8482, "trade;"},
295 {8501, "alefsym;"},
296 {8592, "larr;"},
297 {8593, "uarr;"},
298 {8594, "rarr;"},
299 {8595, "darr;"},
300 {8596, "harr;"},
301 {8629, "crarr;"},
302 {8656, "lArr;"},
303 {8657, "uArr;"},
304 {8658, "rArr;"},
305 {8659, "dArr;"},
306 {8660, "hArr;"},
307 {8704, "forall;"},
308 {8706, "part;"},
309 {8707, "exist;"},
310 {8709, "empty;"},
311 {8711, "nabla;"},
312 {8712, "isin;"},
313 {8713, "notin;"},
314 {8715, "ni;"},
315 {8719, "prod;"},
316 {8721, "sum;"},
317 {8722, "minus;"},
318 {8727, "lowast;"},
319 {8730, "radic;"},
320 {8733, "prop;"},
321 {8734, "infin;"},
322 {8736, "ang;"},
323 {8743, "and;"},
324 {8744, "or;"},
325 {8745, "cap;"},
326 {8746, "cup;"},
327 {8747, "int;"},
328 {8756, "there4;"},
329 {8764, "sim;"},
330 {8773, "cong;"},
331 {8776, "asymp;"},
332 {8800, "ne;"},
333 {8801, "equiv;"},
334 {8804, "le;"},
335 {8805, "ge;"},
336 {8834, "sub;"},
337 {8835, "sup;"},
338 {8836, "nsub;"},
339 {8838, "sube;"},
340 {8839, "supe;"},
341 {8853, "oplus;"},
342 {8855, "otimes;"},
343 {8869, "perp;"},
344 {8901, "sdot;"},
345 {8968, "lceil;"},
346 {8969, "rceil;"},
347 {8970, "lfloor;"},
348 {8971, "rfloor;"},
349 {9001, "lang;"},
350 {9002, "rang;"},
351 {9674, "loz;"},
352 {9824, "spades;"},
353 {9827, "clubs;"},
354 {9829, "hearts;"},
355 {9830, "diams;"},
356 {0, NULL},
357 };
358
359 static gchar *
unescape_string(const gchar * text)360 unescape_string (const gchar * text)
361 {
362 gint i;
363 GString *unescaped = g_string_new (NULL);
364
365 while (*text) {
366 if (*text == '&') {
367 text++;
368
369 /* unescape   and */
370 if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
371 unescaped = g_string_append_unichar (unescaped, 160);
372 text += 4;
373 if (*text == ';') {
374 text++;
375 }
376 goto next;
377 }
378
379 /* pass xml entities. these will be processed as pango markup */
380 for (i = 0; XmlEntities[i].escaped; i++) {
381 gssize len = strlen (XmlEntities[i].escaped);
382 if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
383 unescaped = g_string_append_c (unescaped, '&');
384 unescaped =
385 g_string_append_len (unescaped, XmlEntities[i].escaped, len);
386 text += len;
387 goto next;
388 }
389 }
390
391 /* convert html entities */
392 for (i = 0; HtmlEntities[i].escaped; i++) {
393 gssize len = strlen (HtmlEntities[i].escaped);
394 if (!strncmp (text, HtmlEntities[i].escaped, len)) {
395 unescaped =
396 g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
397 text += len;
398 goto next;
399 }
400 }
401
402 if (*text == '#') {
403 gboolean is_hex = FALSE;
404 gunichar l;
405 gchar *end = NULL;
406
407 text++;
408 if (*text == 'x') {
409 is_hex = TRUE;
410 text++;
411 }
412 errno = 0;
413 if (is_hex) {
414 l = strtoul (text, &end, 16);
415 } else {
416 l = strtoul (text, &end, 10);
417 }
418
419 if (text == end || errno != 0) {
420 /* error occurred. pass it */
421 goto next;
422 }
423 unescaped = g_string_append_unichar (unescaped, l);
424 text = end;
425
426 if (*text == ';') {
427 text++;
428 }
429 goto next;
430 }
431
432 /* escape & */
433 unescaped = g_string_append (unescaped, "&");
434
435 next:
436 continue;
437
438 } else if (g_ascii_isspace (*text)) {
439 unescaped = g_string_append_c (unescaped, ' ');
440 /* strip whitespace */
441 do {
442 text++;
443 } while ((*text) && g_ascii_isspace (*text));
444 } else {
445 unescaped = g_string_append_c (unescaped, *text);
446 text++;
447 }
448 }
449
450 return g_string_free (unescaped, FALSE);
451 }
452
453 static const gchar *
string_token(const gchar * string,const gchar * delimiter,gchar ** first)454 string_token (const gchar * string, const gchar * delimiter, gchar ** first)
455 {
456 gchar *next = strstr (string, delimiter);
457 if (next) {
458 *first = g_strndup (string, next - string);
459 } else {
460 *first = g_strdup (string);
461 }
462 return next;
463 }
464
465 static void
html_context_handle_element(HtmlContext * ctxt,const gchar * string,gboolean must_close)466 html_context_handle_element (HtmlContext * ctxt,
467 const gchar * string, gboolean must_close)
468 {
469 gchar *name = NULL;
470 gint count = 0, i;
471 gchar **attrs;
472 const gchar *found, *next;
473
474 /* split element name and attributes */
475 next = string_token (string, " ", &name);
476
477 if (next) {
478 /* count attributes */
479 found = next + 1;
480 while (TRUE) {
481 found = strchr (found, '=');
482 if (!found)
483 break;
484 found++;
485 count++;
486 }
487 } else {
488 count = 0;
489 }
490
491 attrs = g_new0 (gchar *, (count + 1) * 2);
492
493 for (i = 0; i < count && next != NULL; i += 2) {
494 gchar *attr_name = NULL, *attr_value = NULL;
495 gsize length;
496 next = string_token (next + 1, "=", &attr_name);
497 if (!next) {
498 g_free (attr_name);
499 break;
500 }
501 next = string_token (next + 1, " ", &attr_value);
502
503 /* strip " or ' from attribute value */
504 if (attr_value[0] == '"' || attr_value[0] == '\'') {
505 gchar *tmp = g_strdup (attr_value + 1);
506 g_free (attr_value);
507 attr_value = tmp;
508 }
509
510 length = strlen (attr_value);
511 if (length > 0 && (attr_value[length - 1] == '"'
512 || attr_value[length - 1] == '\'')) {
513 attr_value[length - 1] = '\0';
514 }
515
516 attrs[i] = attr_name;
517 attrs[i + 1] = attr_value;
518 }
519
520 ctxt->parser->start_element (ctxt, name,
521 (const gchar **) attrs, ctxt->user_data);
522 if (must_close) {
523 ctxt->parser->end_element (ctxt, name, ctxt->user_data);
524 }
525 g_strfreev (attrs);
526 g_free (name);
527 }
528
529 static void
html_context_parse(HtmlContext * ctxt,gchar * text,gsize text_len)530 html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
531 {
532 const gchar *next = NULL;
533 ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
534 next = ctxt->buf->str;
535 while (TRUE) {
536 if (next[0] == '<') {
537 gchar *element = NULL;
538 /* find <blahblah> */
539 if (!strchr (next, '>')) {
540 /* no tag end point. buffer will be process in next time */
541 return;
542 }
543
544 next = string_token (next, ">", &element);
545 next++;
546 if (g_str_has_suffix (next, "/")) {
547 /* handle <blah/> */
548 element[strlen (element) - 1] = '\0';
549 html_context_handle_element (ctxt, element + 1, TRUE);
550 } else if (element[1] == '/') {
551 /* handle </blah> */
552 ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
553 } else {
554 /* handle <blah> */
555 html_context_handle_element (ctxt, element + 1, FALSE);
556 }
557 g_free (element);
558 } else if (strchr (next, '<')) {
559 gchar *text = NULL;
560 gsize length;
561 next = string_token (next, "<", &text);
562 text = g_strstrip (text);
563 length = strlen (text);
564 ctxt->parser->text (ctxt, text, length, ctxt->user_data);
565 g_free (text);
566
567 } else {
568 gchar *text = (gchar *) next;
569 gsize length;
570 text = g_strstrip (text);
571 length = strlen (text);
572 ctxt->parser->text (ctxt, text, length, ctxt->user_data);
573 ctxt->buf = g_string_assign (ctxt->buf, "");
574 return;
575 }
576 }
577
578 ctxt->buf = g_string_assign (ctxt->buf, next);
579 }
580
581 static gchar *
has_tag(GString * str,const gchar tag)582 has_tag (GString * str, const gchar tag)
583 {
584 return strrchr (str->str, tag);
585 }
586
587 static void
sami_context_push_state(GstSamiContext * sctx,char state)588 sami_context_push_state (GstSamiContext * sctx, char state)
589 {
590 GST_LOG ("state %c", state);
591 g_string_append_c (sctx->state, state);
592 }
593
594 static void
sami_context_pop_state(GstSamiContext * sctx,char state)595 sami_context_pop_state (GstSamiContext * sctx, char state)
596 {
597 GString *str = g_string_new ("");
598 GString *context_state = sctx->state;
599 int i;
600
601 GST_LOG ("state %c", state);
602 for (i = context_state->len - 1; i >= 0; i--) {
603 switch (context_state->str[i]) {
604 case ITALIC_TAG: /* <i> */
605 {
606 g_string_append (str, "</i>");
607 break;
608 }
609 case SPAN_TAG: /* <span foreground= > */
610 {
611 g_string_append (str, "</span>");
612 break;
613 }
614 case RUBY_TAG: /* <span size= > -- ruby */
615 {
616 break;
617 }
618 case RT_TAG: /* ruby */
619 {
620 /* FIXME: support for furigana/ruby once implemented in pango */
621 g_string_append (sctx->rubybuf, "</span>");
622 if (has_tag (context_state, ITALIC_TAG)) {
623 g_string_append (sctx->rubybuf, "</i>");
624 }
625
626 break;
627 }
628 default:
629 break;
630 }
631 if (context_state->str[i] == state) {
632 g_string_append (sctx->buf, str->str);
633 g_string_free (str, TRUE);
634 g_string_truncate (context_state, i);
635 return;
636 }
637 }
638 if (state == CLEAR_TAG) {
639 g_string_append (sctx->buf, str->str);
640 g_string_truncate (context_state, 0);
641 }
642 g_string_free (str, TRUE);
643 }
644
645 static void
handle_start_sync(GstSamiContext * sctx,const gchar ** atts)646 handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
647 {
648 int i;
649
650 sami_context_pop_state (sctx, CLEAR_TAG);
651 if (atts != NULL) {
652 for (i = 0; (atts[i] != NULL); i += 2) {
653 const gchar *key, *value;
654
655 key = atts[i];
656 value = atts[i + 1];
657
658 if (!value)
659 continue;
660 if (!g_ascii_strcasecmp ("start", key)) {
661 /* Only set a new start time if we don't have text pending */
662 if (sctx->resultbuf->len == 0)
663 sctx->time1 = sctx->time2;
664
665 sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
666 sctx->time2 = MAX (sctx->time2, sctx->time1);
667 g_string_append (sctx->resultbuf, sctx->buf->str);
668 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
669 g_string_truncate (sctx->buf, 0);
670 }
671 }
672 }
673 }
674
675 static void
handle_start_font(GstSamiContext * sctx,const gchar ** atts)676 handle_start_font (GstSamiContext * sctx, const gchar ** atts)
677 {
678 int i;
679
680 sami_context_pop_state (sctx, SPAN_TAG);
681 if (atts != NULL) {
682 g_string_append (sctx->buf, "<span");
683 for (i = 0; (atts[i] != NULL); i += 2) {
684 const gchar *key, *value;
685
686 key = atts[i];
687 value = atts[i + 1];
688
689 if (!value)
690 continue;
691 if (!g_ascii_strcasecmp ("color", key)) {
692 /*
693 * There are invalid color value in many
694 * sami files.
695 * It will fix hex color value that start without '#'
696 */
697 const gchar *sharp = "";
698 int len = strlen (value);
699
700 if (!(*value == '#' && len == 7)) {
701 gchar *r;
702
703 /* check if it looks like hex */
704 if (strtol ((const char *) value, &r, 16) >= 0 &&
705 ((gchar *) r == (value + 6) && len == 6)) {
706 sharp = "#";
707 }
708 }
709 /* some colours can be found in many sami files, but X RGB database
710 * doesn't contain a colour by this name, so map explicitly */
711 if (!g_ascii_strcasecmp ("aqua", value)) {
712 value = "#00ffff";
713 } else if (!g_ascii_strcasecmp ("crimson", value)) {
714 value = "#dc143c";
715 } else if (!g_ascii_strcasecmp ("fuchsia", value)) {
716 value = "#ff00ff";
717 } else if (!g_ascii_strcasecmp ("indigo", value)) {
718 value = "#4b0082";
719 } else if (!g_ascii_strcasecmp ("lime", value)) {
720 value = "#00ff00";
721 } else if (!g_ascii_strcasecmp ("olive", value)) {
722 value = "#808000";
723 } else if (!g_ascii_strcasecmp ("silver", value)) {
724 value = "#c0c0c0";
725 } else if (!g_ascii_strcasecmp ("teal", value)) {
726 value = "#008080";
727 }
728 g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
729 value);
730 } else if (!g_ascii_strcasecmp ("face", key)) {
731 g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
732 }
733 }
734 g_string_append_c (sctx->buf, '>');
735 sami_context_push_state (sctx, SPAN_TAG);
736 }
737 }
738
739 static void
handle_start_element(HtmlContext * ctx,const gchar * name,const char ** atts,gpointer user_data)740 handle_start_element (HtmlContext * ctx, const gchar * name,
741 const char **atts, gpointer user_data)
742 {
743 GstSamiContext *sctx = (GstSamiContext *) user_data;
744
745 GST_LOG ("name:%s", name);
746
747 if (!g_ascii_strcasecmp ("sync", name)) {
748 handle_start_sync (sctx, atts);
749 sctx->in_sync = TRUE;
750 } else if (!g_ascii_strcasecmp ("font", name)) {
751 handle_start_font (sctx, atts);
752 } else if (!g_ascii_strcasecmp ("ruby", name)) {
753 sami_context_push_state (sctx, RUBY_TAG);
754 } else if (!g_ascii_strcasecmp ("br", name)) {
755 g_string_append_c (sctx->buf, '\n');
756 /* FIXME: support for furigana/ruby once implemented in pango */
757 } else if (!g_ascii_strcasecmp ("rt", name)) {
758 if (has_tag (sctx->state, ITALIC_TAG)) {
759 g_string_append (sctx->rubybuf, "<i>");
760 }
761 g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
762 sami_context_push_state (sctx, RT_TAG);
763 } else if (!g_ascii_strcasecmp ("i", name)) {
764 g_string_append (sctx->buf, "<i>");
765 sami_context_push_state (sctx, ITALIC_TAG);
766 } else if (!g_ascii_strcasecmp ("p", name)) {
767 }
768 }
769
770 static void
handle_end_element(HtmlContext * ctx,const char * name,gpointer user_data)771 handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
772 {
773 GstSamiContext *sctx = (GstSamiContext *) user_data;
774
775 GST_LOG ("name:%s", name);
776
777 if (!g_ascii_strcasecmp ("sync", name)) {
778 sctx->in_sync = FALSE;
779 } else if ((!g_ascii_strcasecmp ("body", name)) ||
780 (!g_ascii_strcasecmp ("sami", name))) {
781 /* We will usually have one buffer left when the body is closed
782 * as we need the next sync to actually send it */
783 if (sctx->buf->len != 0) {
784 /* Only set a new start time if we don't have text pending */
785 if (sctx->resultbuf->len == 0)
786 sctx->time1 = sctx->time2;
787
788 sctx->time2 = GST_CLOCK_TIME_NONE;
789 g_string_append (sctx->resultbuf, sctx->buf->str);
790 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
791 g_string_truncate (sctx->buf, 0);
792 }
793 } else if (!g_ascii_strcasecmp ("font", name)) {
794 sami_context_pop_state (sctx, SPAN_TAG);
795 } else if (!g_ascii_strcasecmp ("ruby", name)) {
796 sami_context_pop_state (sctx, RUBY_TAG);
797 } else if (!g_ascii_strcasecmp ("i", name)) {
798 sami_context_pop_state (sctx, ITALIC_TAG);
799 }
800 }
801
802 static void
handle_text(HtmlContext * ctx,const gchar * text,gsize text_len,gpointer user_data)803 handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
804 gpointer user_data)
805 {
806 GstSamiContext *sctx = (GstSamiContext *) user_data;
807
808 /* Skip everything except content of the sync elements */
809 if (!sctx->in_sync)
810 return;
811
812 if (has_tag (sctx->state, RT_TAG)) {
813 g_string_append_c (sctx->rubybuf, ' ');
814 g_string_append (sctx->rubybuf, text);
815 g_string_append_c (sctx->rubybuf, ' ');
816 } else {
817 g_string_append (sctx->buf, text);
818 }
819 }
820
/* Callback table connecting the HTML tokenizer to the SAMI handlers; it is
 * handed to html_context_new() by sami_context_init().  The entries are
 * positional and follow the member order of struct _HtmlParser. */
static HtmlParser samiParser = {
  handle_start_element,         /* start_element */
  handle_end_element,           /* end_element */
  handle_text,                  /* text */
};
826
827 void
sami_context_init(ParserState * state)828 sami_context_init (ParserState * state)
829 {
830 GstSamiContext *context;
831
832 g_assert (state->user_data == NULL);
833
834 context = g_new0 (GstSamiContext, 1);
835
836 context->htmlctxt = html_context_new (&samiParser, context);
837 context->buf = g_string_new ("");
838 context->rubybuf = g_string_new ("");
839 context->resultbuf = g_string_new ("");
840 context->state = g_string_new ("");
841
842 state->user_data = context;
843 }
844
845 void
sami_context_deinit(ParserState * state)846 sami_context_deinit (ParserState * state)
847 {
848 GstSamiContext *context = (GstSamiContext *) state->user_data;
849
850 if (context) {
851 html_context_free (context->htmlctxt);
852 context->htmlctxt = NULL;
853 g_string_free (context->buf, TRUE);
854 g_string_free (context->rubybuf, TRUE);
855 g_string_free (context->resultbuf, TRUE);
856 g_string_free (context->state, TRUE);
857 g_free (context);
858 state->user_data = NULL;
859 }
860 }
861
862 void
sami_context_reset(ParserState * state)863 sami_context_reset (ParserState * state)
864 {
865 GstSamiContext *context = (GstSamiContext *) state->user_data;
866
867 if (context) {
868 g_string_truncate (context->buf, 0);
869 g_string_truncate (context->rubybuf, 0);
870 g_string_truncate (context->resultbuf, 0);
871 g_string_truncate (context->state, 0);
872 context->has_result = FALSE;
873 context->in_sync = FALSE;
874 context->time1 = 0;
875 context->time2 = 0;
876 }
877 }
878
879 gchar *
parse_sami(ParserState * state,const gchar * line)880 parse_sami (ParserState * state, const gchar * line)
881 {
882 gchar *ret = NULL;
883 GstSamiContext *context = (GstSamiContext *) state->user_data;
884
885 gchar *unescaped = unescape_string (line);
886 html_context_parse (context->htmlctxt, (gchar *) unescaped,
887 strlen (unescaped));
888 g_free (unescaped);
889
890 if (context->has_result) {
891 if (context->rubybuf->len) {
892 context->rubybuf = g_string_append_c (context->rubybuf, '\n');
893 g_string_prepend (context->resultbuf, context->rubybuf->str);
894 context->rubybuf = g_string_truncate (context->rubybuf, 0);
895 }
896
897 ret = g_string_free (context->resultbuf, FALSE);
898 context->resultbuf = g_string_new ("");
899 state->start_time = context->time1;
900 state->duration = context->time2 - context->time1;
901 context->has_result = FALSE;
902 }
903 return ret;
904 }
905