• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GStreamer
2  * Copyright (C) 2020 Huawei Technologies Co., Ltd.
3  *   @Author: Stéphane Cerveau <scerveau@collabora.com>
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public
16  * License along with this library; if not, write to the Free
17  * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
18  */
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config.h"
22 #endif
23 
24 #include <stdio.h>
25 
26 #include "gstsubparseelements.h"
27 
28 GST_DEBUG_CATEGORY (sub_parse_debug);
29 
/*
 * Identifies which auto-detection regular expression should be compiled by
 * gst_sub_parse_data_format_autodetect_regex_once().
 *
 * Enumerators are numbered consecutively starting from 0.
 */
typedef enum
{
  GST_SUB_PARSE_REGEX_UNKNOWN = 0,
  GST_SUB_PARSE_REGEX_MDVDSUB,
  GST_SUB_PARSE_REGEX_SUBRIP,
  GST_SUB_PARSE_REGEX_DKS,
  GST_SUB_PARSE_REGEX_VTT
} GstSubParseRegex;
39 
40 static gpointer
gst_sub_parse_data_format_autodetect_regex_once(GstSubParseRegex regtype)41 gst_sub_parse_data_format_autodetect_regex_once (GstSubParseRegex regtype)
42 {
43   gpointer result = NULL;
44   GError *gerr = NULL;
45   switch (regtype) {
46     case GST_SUB_PARSE_REGEX_MDVDSUB:
47       result =
48           (gpointer) g_regex_new ("^\\{[0-9]+\\}\\{[0-9]+\\}",
49           G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr);
50       if (result == NULL) {
51         g_warning ("Compilation of mdvd regex failed: %s", gerr->message);
52         g_clear_error (&gerr);
53       }
54       break;
55     case GST_SUB_PARSE_REGEX_SUBRIP:
56       result = (gpointer)
57           g_regex_new ("^[\\s\\n]*[\\n]? {0,3}[ 0-9]{1,4}\\s*(\x0d)?\x0a"
58           " ?[0-9]{1,2}: ?[0-9]{1,2}: ?[0-9]{1,2}[,.] {0,2}[0-9]{1,3}"
59           " +--> +[0-9]{1,2}: ?[0-9]{1,2}: ?[0-9]{1,2}[,.] {0,2}[0-9]{1,2}",
60           G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr);
61       if (result == NULL) {
62         g_warning ("Compilation of subrip regex failed: %s", gerr->message);
63         g_clear_error (&gerr);
64       }
65       break;
66     case GST_SUB_PARSE_REGEX_DKS:
67       result = (gpointer) g_regex_new ("^\\[[0-9]+:[0-9]+:[0-9]+\\].*",
68           G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr);
69       if (result == NULL) {
70         g_warning ("Compilation of dks regex failed: %s", gerr->message);
71         g_clear_error (&gerr);
72       }
73       break;
74     case GST_SUB_PARSE_REGEX_VTT:
75       result = (gpointer)
76           g_regex_new ("^(\\xef\\xbb\\xbf)?WEBVTT[\\xa\\xd\\x20\\x9]", 0, 0,
77           &gerr);
78       if (result == NULL) {
79         g_warning ("Compilation of vtt regex failed: %s", gerr->message);
80         g_error_free (gerr);
81       }
82       break;
83 
84     default:
85       GST_WARNING ("Trying to allocate regex of unknown type %u", regtype);
86   }
87   return result;
88 }
89 
90 /*
91  * FIXME: maybe we should pass along a second argument, the preceding
92  * text buffer, because that is how this originally worked, even though
93  * I don't really see the use of that.
94  */
95 
96 GstSubParseFormat
gst_sub_parse_data_format_autodetect(gchar * match_str)97 gst_sub_parse_data_format_autodetect (gchar * match_str)
98 {
99   guint n1, n2, n3;
100 
101   static GOnce mdvd_rx_once = G_ONCE_INIT;
102   static GOnce subrip_rx_once = G_ONCE_INIT;
103   static GOnce dks_rx_once = G_ONCE_INIT;
104   static GOnce vtt_rx_once = G_ONCE_INIT;
105 
106   GRegex *mdvd_grx;
107   GRegex *subrip_grx;
108   GRegex *dks_grx;
109   GRegex *vtt_grx;
110 
111   g_once (&mdvd_rx_once,
112       (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
113       (gpointer) GST_SUB_PARSE_REGEX_MDVDSUB);
114   g_once (&subrip_rx_once,
115       (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
116       (gpointer) GST_SUB_PARSE_REGEX_SUBRIP);
117   g_once (&dks_rx_once,
118       (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
119       (gpointer) GST_SUB_PARSE_REGEX_DKS);
120   g_once (&vtt_rx_once,
121       (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
122       (gpointer) GST_SUB_PARSE_REGEX_VTT);
123 
124   mdvd_grx = (GRegex *) mdvd_rx_once.retval;
125   subrip_grx = (GRegex *) subrip_rx_once.retval;
126   dks_grx = (GRegex *) dks_rx_once.retval;
127   vtt_grx = (GRegex *) vtt_rx_once.retval;
128 
129   if (g_regex_match (mdvd_grx, match_str, 0, NULL)) {
130     GST_LOG ("MicroDVD (frame based) format detected");
131     return GST_SUB_PARSE_FORMAT_MDVDSUB;
132   }
133   if (g_regex_match (subrip_grx, match_str, 0, NULL)) {
134     GST_LOG ("SubRip (time based) format detected");
135     return GST_SUB_PARSE_FORMAT_SUBRIP;
136   }
137   if (g_regex_match (dks_grx, match_str, 0, NULL)) {
138     GST_LOG ("DKS (time based) format detected");
139     return GST_SUB_PARSE_FORMAT_DKS;
140   }
141   if (g_regex_match (vtt_grx, match_str, 0, NULL) == TRUE) {
142     GST_LOG ("WebVTT (time based) format detected");
143     return GST_SUB_PARSE_FORMAT_VTT;
144   }
145 
146   if (!strncmp (match_str, "FORMAT=TIME", 11)) {
147     GST_LOG ("MPSub (time based) format detected");
148     return GST_SUB_PARSE_FORMAT_MPSUB;
149   }
150   if (strstr (match_str, "<SAMI>") != NULL ||
151       strstr (match_str, "<sami>") != NULL) {
152     GST_LOG ("SAMI (time based) format detected");
153     return GST_SUB_PARSE_FORMAT_SAMI;
154   }
155   /* we're boldly assuming the first subtitle appears within the first hour */
156   if (sscanf (match_str, "0:%02u:%02u:", &n1, &n2) == 2 ||
157       sscanf (match_str, "0:%02u:%02u=", &n1, &n2) == 2 ||
158       sscanf (match_str, "00:%02u:%02u:", &n1, &n2) == 2 ||
159       sscanf (match_str, "00:%02u:%02u=", &n1, &n2) == 2 ||
160       sscanf (match_str, "00:%02u:%02u,%u=", &n1, &n2, &n3) == 3) {
161     GST_LOG ("TMPlayer (time based) format detected");
162     return GST_SUB_PARSE_FORMAT_TMPLAYER;
163   }
164   if (sscanf (match_str, "[%u][%u]", &n1, &n2) == 2) {
165     GST_LOG ("MPL2 (time based) format detected");
166     return GST_SUB_PARSE_FORMAT_MPL2;
167   }
168   if (strstr (match_str, "[INFORMATION]") != NULL) {
169     GST_LOG ("SubViewer (time based) format detected");
170     return GST_SUB_PARSE_FORMAT_SUBVIEWER;
171   }
172   if (strstr (match_str, "{QTtext}") != NULL) {
173     GST_LOG ("QTtext (time based) format detected");
174     return GST_SUB_PARSE_FORMAT_QTTEXT;
175   }
176   /* We assume the LRC file starts immediately */
177   if (match_str[0] == '[') {
178     gboolean all_lines_good = TRUE;
179     gchar **split;
180     gchar **ptr;
181 
182     ptr = split = g_strsplit (match_str, "\n", -1);
183     while (*ptr && *(ptr + 1)) {
184       gchar *str = *ptr;
185       gint len = strlen (str);
186 
187       if (sscanf (str, "[%u:%02u.%02u]", &n1, &n2, &n3) == 3 ||
188           sscanf (str, "[%u:%02u.%03u]", &n1, &n2, &n3) == 3) {
189         all_lines_good = TRUE;
190       } else if (len > 0 && str[len - 1] == ']' && strchr (str, ':') != NULL) {
191         all_lines_good = TRUE;
192       } else {
193         all_lines_good = FALSE;
194         break;
195       }
196 
197       ptr++;
198     }
199     g_strfreev (split);
200 
201     if (all_lines_good)
202       return GST_SUB_PARSE_FORMAT_LRC;
203   }
204 
205   GST_DEBUG ("no subtitle format detected");
206   return GST_SUB_PARSE_FORMAT_UNKNOWN;
207 }
208 
209 gchar *
gst_sub_parse_gst_convert_to_utf8(const gchar * str,gsize len,const gchar * encoding,gsize * consumed,GError ** err)210 gst_sub_parse_gst_convert_to_utf8 (const gchar * str, gsize len,
211     const gchar * encoding, gsize * consumed, GError ** err)
212 {
213   gchar *ret = NULL;
214 
215   *consumed = 0;
216   /* The char cast is necessary in glib < 2.24 */
217   ret =
218       g_convert_with_fallback (str, len, "UTF-8", encoding, (char *) "*",
219       consumed, NULL, err);
220   if (ret == NULL)
221     return ret;
222 
223   /* + 3 to skip UTF-8 BOM if it was added */
224   len = strlen (ret);
225   if (len >= 3 && (guint8) ret[0] == 0xEF && (guint8) ret[1] == 0xBB
226       && (guint8) ret[2] == 0xBF)
227     memmove (ret, ret + 3, len + 1 - 3);
228 
229   return ret;
230 }
231 
232 gchar *
gst_sub_parse_detect_encoding(const gchar * str,gsize len)233 gst_sub_parse_detect_encoding (const gchar * str, gsize len)
234 {
235   if (len >= 3 && (guint8) str[0] == 0xEF && (guint8) str[1] == 0xBB
236       && (guint8) str[2] == 0xBF)
237     return g_strdup ("UTF-8");
238 
239   if (len >= 2 && (guint8) str[0] == 0xFE && (guint8) str[1] == 0xFF)
240     return g_strdup ("UTF-16BE");
241 
242   if (len >= 2 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE)
243     return g_strdup ("UTF-16LE");
244 
245   if (len >= 4 && (guint8) str[0] == 0x00 && (guint8) str[1] == 0x00
246       && (guint8) str[2] == 0xFE && (guint8) str[3] == 0xFF)
247     return g_strdup ("UTF-32BE");
248 
249   if (len >= 4 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE
250       && (guint8) str[2] == 0x00 && (guint8) str[3] == 0x00)
251     return g_strdup ("UTF-32LE");
252 
253   return NULL;
254 }
255 
256 /*
257  * Typefind support.
258  */
259 
260 /* FIXME 0.11: these caps are ugly, use app/x-subtitle + type field or so;
261  * also, give different  subtitle formats really different types */
262 static GstStaticCaps mpl2_caps =
263 GST_STATIC_CAPS ("application/x-subtitle-mpl2");
264 #define SUB_CAPS (gst_static_caps_get (&sub_caps))
265 
266 static GstStaticCaps tmp_caps =
267 GST_STATIC_CAPS ("application/x-subtitle-tmplayer");
268 #define TMP_CAPS (gst_static_caps_get (&tmp_caps))
269 
270 static GstStaticCaps sub_caps = GST_STATIC_CAPS ("application/x-subtitle");
271 #define MPL2_CAPS (gst_static_caps_get (&mpl2_caps))
272 
273 static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami");
274 #define SAMI_CAPS (gst_static_caps_get (&smi_caps))
275 
276 static GstStaticCaps dks_caps = GST_STATIC_CAPS ("application/x-subtitle-dks");
277 #define DKS_CAPS (gst_static_caps_get (&dks_caps))
278 
279 static GstStaticCaps vtt_caps = GST_STATIC_CAPS ("application/x-subtitle-vtt");
280 #define VTT_CAPS (gst_static_caps_get (&vtt_caps))
281 
282 static GstStaticCaps qttext_caps =
283 GST_STATIC_CAPS ("application/x-subtitle-qttext");
284 #define QTTEXT_CAPS (gst_static_caps_get (&qttext_caps))
285 
286 static GstStaticCaps lrc_caps = GST_STATIC_CAPS ("application/x-subtitle-lrc");
287 #define LRC_CAPS (gst_static_caps_get (&lrc_caps))
288 
289 static void
gst_sub_parse_type_find(GstTypeFind * tf,gpointer private)290 gst_sub_parse_type_find (GstTypeFind * tf, gpointer private)
291 {
292   GstSubParseFormat format;
293   const guint8 *data;
294   GstCaps *caps;
295   gchar *str;
296   gchar *encoding = NULL;
297   const gchar *end;
298 
299   if (!(data = gst_type_find_peek (tf, 0, 129)))
300     return;
301 
302   /* make sure string passed to _autodetect() is NUL-terminated */
303   str = g_malloc0 (129);
304   memcpy (str, data, 128);
305 
306   if ((encoding = gst_sub_parse_detect_encoding (str, 128)) != NULL) {
307     gchar *converted_str;
308     GError *err = NULL;
309     gsize tmp;
310 
311     converted_str =
312         gst_sub_parse_gst_convert_to_utf8 (str, 128, encoding, &tmp, &err);
313     if (converted_str == NULL) {
314       GST_DEBUG ("Encoding '%s' detected but conversion failed: %s", encoding,
315           err->message);
316       g_clear_error (&err);
317     } else {
318       g_free (str);
319       str = converted_str;
320     }
321     g_free (encoding);
322   }
323 
324   /* Check if at least the first 120 chars are valid UTF8,
325    * otherwise convert as always */
326   if (!g_utf8_validate (str, 128, &end) && (end - str) < 120) {
327     gchar *converted_str;
328     gsize tmp;
329     const gchar *enc;
330 
331     enc = g_getenv ("GST_SUBTITLE_ENCODING");
332     if (enc == NULL || *enc == '\0') {
333       /* if local encoding is UTF-8 and no encoding specified
334        * via the environment variable, assume ISO-8859-15 */
335       if (g_get_charset (&enc)) {
336         enc = "ISO-8859-15";
337       }
338     }
339     converted_str =
340         gst_sub_parse_gst_convert_to_utf8 (str, 128, enc, &tmp, NULL);
341     if (converted_str != NULL) {
342       g_free (str);
343       str = converted_str;
344     }
345   }
346 
347   format = gst_sub_parse_data_format_autodetect (str);
348   g_free (str);
349 
350   switch (format) {
351     case GST_SUB_PARSE_FORMAT_MDVDSUB:
352       GST_DEBUG ("MicroDVD format detected");
353       caps = SUB_CAPS;
354       break;
355     case GST_SUB_PARSE_FORMAT_SUBRIP:
356       GST_DEBUG ("SubRip format detected");
357       caps = SUB_CAPS;
358       break;
359     case GST_SUB_PARSE_FORMAT_MPSUB:
360       GST_DEBUG ("MPSub format detected");
361       caps = SUB_CAPS;
362       break;
363     case GST_SUB_PARSE_FORMAT_SAMI:
364       GST_DEBUG ("SAMI (time-based) format detected");
365       caps = SAMI_CAPS;
366       break;
367     case GST_SUB_PARSE_FORMAT_TMPLAYER:
368       GST_DEBUG ("TMPlayer (time based) format detected");
369       caps = TMP_CAPS;
370       break;
371       /* FIXME: our MPL2 typefinding is not really good enough to warrant
372        * returning a high probability (however, since we registered our
373        * typefinder here with a rank of MARGINAL we should pretty much only
374        * be called if most other typefinders have already run */
375     case GST_SUB_PARSE_FORMAT_MPL2:
376       GST_DEBUG ("MPL2 (time based) format detected");
377       caps = MPL2_CAPS;
378       break;
379     case GST_SUB_PARSE_FORMAT_SUBVIEWER:
380       GST_DEBUG ("SubViewer format detected");
381       caps = SUB_CAPS;
382       break;
383     case GST_SUB_PARSE_FORMAT_DKS:
384       GST_DEBUG ("DKS format detected");
385       caps = DKS_CAPS;
386       break;
387     case GST_SUB_PARSE_FORMAT_QTTEXT:
388       GST_DEBUG ("QTtext format detected");
389       caps = QTTEXT_CAPS;
390       break;
391     case GST_SUB_PARSE_FORMAT_LRC:
392       GST_DEBUG ("LRC format detected");
393       caps = LRC_CAPS;
394       break;
395     case GST_SUB_PARSE_FORMAT_VTT:
396       GST_DEBUG ("WebVTT format detected");
397       caps = VTT_CAPS;
398       break;
399     default:
400     case GST_SUB_PARSE_FORMAT_UNKNOWN:
401       GST_DEBUG ("no subtitle format detected");
402       return;
403   }
404 
405   /* if we're here, it's ok */
406   gst_type_find_suggest (tf, GST_TYPE_FIND_MAXIMUM, caps);
407 }
408 
409 GST_TYPE_FIND_REGISTER_DEFINE (subparse, "subparse_typefind", GST_RANK_MARGINAL,
410     gst_sub_parse_type_find, "srt,sub,mpsub,mdvd,smi,txt,dks,vtt", SUB_CAPS,
411     NULL, NULL)
412 
413     gboolean
sub_parse_element_init(GstPlugin * plugin)414 sub_parse_element_init (GstPlugin * plugin)
415 {
416   static gsize res = FALSE;
417   gboolean ret = TRUE;
418   if (g_once_init_enter (&res)) {
419     GST_DEBUG_CATEGORY_INIT (sub_parse_debug, "subparse", 0, ".sub parser");
420 
421     ret |= GST_TYPE_FIND_REGISTER (subparse, plugin);
422 
423     g_once_init_leave (&res, TRUE);
424   }
425   return ret;
426 }
427