1 /* GStreamer
2 * Copyright (C) 2020 Huawei Technologies Co., Ltd.
3 * @Author: Stéphane Cerveau <scerveau@collabora.com>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public
16 * License along with this library; if not, write to the Free
17 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
18 */
19
20 #ifdef HAVE_CONFIG_H
21 #include "config.h"
22 #endif
23
24 #include <stdio.h>
25
26 #include "gstsubparseelements.h"
27
/* Debug category used by the logging macros in this file; it is initialised
 * under the name "subparse" in sub_parse_element_init(). */
GST_DEBUG_CATEGORY (sub_parse_debug);
29
/* Selects which format-detection regular expression
 * gst_sub_parse_data_format_autodetect_regex_once() should compile. */
typedef enum
{
  GST_SUB_PARSE_REGEX_UNKNOWN = 0,      /* no regex associated */
  GST_SUB_PARSE_REGEX_MDVDSUB,  /* 1: "{n}{n}" frame markers */
  GST_SUB_PARSE_REGEX_SUBRIP,   /* 2: "hh:mm:ss,ms --> hh:mm:ss,ms" cue */
  GST_SUB_PARSE_REGEX_DKS,      /* 3: "[hh:mm:ss]" prefix */
  GST_SUB_PARSE_REGEX_VTT,      /* 4: "WEBVTT" header */
} GstSubParseRegex;
39
40 static gpointer
gst_sub_parse_data_format_autodetect_regex_once(GstSubParseRegex regtype)41 gst_sub_parse_data_format_autodetect_regex_once (GstSubParseRegex regtype)
42 {
43 gpointer result = NULL;
44 GError *gerr = NULL;
45 switch (regtype) {
46 case GST_SUB_PARSE_REGEX_MDVDSUB:
47 result =
48 (gpointer) g_regex_new ("^\\{[0-9]+\\}\\{[0-9]+\\}",
49 G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr);
50 if (result == NULL) {
51 g_warning ("Compilation of mdvd regex failed: %s", gerr->message);
52 g_clear_error (&gerr);
53 }
54 break;
55 case GST_SUB_PARSE_REGEX_SUBRIP:
56 result = (gpointer)
57 g_regex_new ("^[\\s\\n]*[\\n]? {0,3}[ 0-9]{1,4}\\s*(\x0d)?\x0a"
58 " ?[0-9]{1,2}: ?[0-9]{1,2}: ?[0-9]{1,2}[,.] {0,2}[0-9]{1,3}"
59 " +--> +[0-9]{1,2}: ?[0-9]{1,2}: ?[0-9]{1,2}[,.] {0,2}[0-9]{1,2}",
60 G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr);
61 if (result == NULL) {
62 g_warning ("Compilation of subrip regex failed: %s", gerr->message);
63 g_clear_error (&gerr);
64 }
65 break;
66 case GST_SUB_PARSE_REGEX_DKS:
67 result = (gpointer) g_regex_new ("^\\[[0-9]+:[0-9]+:[0-9]+\\].*",
68 G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr);
69 if (result == NULL) {
70 g_warning ("Compilation of dks regex failed: %s", gerr->message);
71 g_clear_error (&gerr);
72 }
73 break;
74 case GST_SUB_PARSE_REGEX_VTT:
75 result = (gpointer)
76 g_regex_new ("^(\\xef\\xbb\\xbf)?WEBVTT[\\xa\\xd\\x20\\x9]", 0, 0,
77 &gerr);
78 if (result == NULL) {
79 g_warning ("Compilation of vtt regex failed: %s", gerr->message);
80 g_error_free (gerr);
81 }
82 break;
83
84 default:
85 GST_WARNING ("Trying to allocate regex of unknown type %u", regtype);
86 }
87 return result;
88 }
89
90 /*
91 * FIXME: maybe we should pass along a second argument, the preceding
92 * text buffer, because that is how this originally worked, even though
93 * I don't really see the use of that.
94 */
95
96 GstSubParseFormat
gst_sub_parse_data_format_autodetect(gchar * match_str)97 gst_sub_parse_data_format_autodetect (gchar * match_str)
98 {
99 guint n1, n2, n3;
100
101 static GOnce mdvd_rx_once = G_ONCE_INIT;
102 static GOnce subrip_rx_once = G_ONCE_INIT;
103 static GOnce dks_rx_once = G_ONCE_INIT;
104 static GOnce vtt_rx_once = G_ONCE_INIT;
105
106 GRegex *mdvd_grx;
107 GRegex *subrip_grx;
108 GRegex *dks_grx;
109 GRegex *vtt_grx;
110
111 g_once (&mdvd_rx_once,
112 (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
113 (gpointer) GST_SUB_PARSE_REGEX_MDVDSUB);
114 g_once (&subrip_rx_once,
115 (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
116 (gpointer) GST_SUB_PARSE_REGEX_SUBRIP);
117 g_once (&dks_rx_once,
118 (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
119 (gpointer) GST_SUB_PARSE_REGEX_DKS);
120 g_once (&vtt_rx_once,
121 (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
122 (gpointer) GST_SUB_PARSE_REGEX_VTT);
123
124 mdvd_grx = (GRegex *) mdvd_rx_once.retval;
125 subrip_grx = (GRegex *) subrip_rx_once.retval;
126 dks_grx = (GRegex *) dks_rx_once.retval;
127 vtt_grx = (GRegex *) vtt_rx_once.retval;
128
129 if (g_regex_match (mdvd_grx, match_str, 0, NULL)) {
130 GST_LOG ("MicroDVD (frame based) format detected");
131 return GST_SUB_PARSE_FORMAT_MDVDSUB;
132 }
133 if (g_regex_match (subrip_grx, match_str, 0, NULL)) {
134 GST_LOG ("SubRip (time based) format detected");
135 return GST_SUB_PARSE_FORMAT_SUBRIP;
136 }
137 if (g_regex_match (dks_grx, match_str, 0, NULL)) {
138 GST_LOG ("DKS (time based) format detected");
139 return GST_SUB_PARSE_FORMAT_DKS;
140 }
141 if (g_regex_match (vtt_grx, match_str, 0, NULL) == TRUE) {
142 GST_LOG ("WebVTT (time based) format detected");
143 return GST_SUB_PARSE_FORMAT_VTT;
144 }
145
146 if (!strncmp (match_str, "FORMAT=TIME", 11)) {
147 GST_LOG ("MPSub (time based) format detected");
148 return GST_SUB_PARSE_FORMAT_MPSUB;
149 }
150 if (strstr (match_str, "<SAMI>") != NULL ||
151 strstr (match_str, "<sami>") != NULL) {
152 GST_LOG ("SAMI (time based) format detected");
153 return GST_SUB_PARSE_FORMAT_SAMI;
154 }
155 /* we're boldly assuming the first subtitle appears within the first hour */
156 if (sscanf (match_str, "0:%02u:%02u:", &n1, &n2) == 2 ||
157 sscanf (match_str, "0:%02u:%02u=", &n1, &n2) == 2 ||
158 sscanf (match_str, "00:%02u:%02u:", &n1, &n2) == 2 ||
159 sscanf (match_str, "00:%02u:%02u=", &n1, &n2) == 2 ||
160 sscanf (match_str, "00:%02u:%02u,%u=", &n1, &n2, &n3) == 3) {
161 GST_LOG ("TMPlayer (time based) format detected");
162 return GST_SUB_PARSE_FORMAT_TMPLAYER;
163 }
164 if (sscanf (match_str, "[%u][%u]", &n1, &n2) == 2) {
165 GST_LOG ("MPL2 (time based) format detected");
166 return GST_SUB_PARSE_FORMAT_MPL2;
167 }
168 if (strstr (match_str, "[INFORMATION]") != NULL) {
169 GST_LOG ("SubViewer (time based) format detected");
170 return GST_SUB_PARSE_FORMAT_SUBVIEWER;
171 }
172 if (strstr (match_str, "{QTtext}") != NULL) {
173 GST_LOG ("QTtext (time based) format detected");
174 return GST_SUB_PARSE_FORMAT_QTTEXT;
175 }
176 /* We assume the LRC file starts immediately */
177 if (match_str[0] == '[') {
178 gboolean all_lines_good = TRUE;
179 gchar **split;
180 gchar **ptr;
181
182 ptr = split = g_strsplit (match_str, "\n", -1);
183 while (*ptr && *(ptr + 1)) {
184 gchar *str = *ptr;
185 gint len = strlen (str);
186
187 if (sscanf (str, "[%u:%02u.%02u]", &n1, &n2, &n3) == 3 ||
188 sscanf (str, "[%u:%02u.%03u]", &n1, &n2, &n3) == 3) {
189 all_lines_good = TRUE;
190 } else if (len > 0 && str[len - 1] == ']' && strchr (str, ':') != NULL) {
191 all_lines_good = TRUE;
192 } else {
193 all_lines_good = FALSE;
194 break;
195 }
196
197 ptr++;
198 }
199 g_strfreev (split);
200
201 if (all_lines_good)
202 return GST_SUB_PARSE_FORMAT_LRC;
203 }
204
205 GST_DEBUG ("no subtitle format detected");
206 return GST_SUB_PARSE_FORMAT_UNKNOWN;
207 }
208
209 gchar *
gst_sub_parse_gst_convert_to_utf8(const gchar * str,gsize len,const gchar * encoding,gsize * consumed,GError ** err)210 gst_sub_parse_gst_convert_to_utf8 (const gchar * str, gsize len,
211 const gchar * encoding, gsize * consumed, GError ** err)
212 {
213 gchar *ret = NULL;
214
215 *consumed = 0;
216 /* The char cast is necessary in glib < 2.24 */
217 ret =
218 g_convert_with_fallback (str, len, "UTF-8", encoding, (char *) "*",
219 consumed, NULL, err);
220 if (ret == NULL)
221 return ret;
222
223 /* + 3 to skip UTF-8 BOM if it was added */
224 len = strlen (ret);
225 if (len >= 3 && (guint8) ret[0] == 0xEF && (guint8) ret[1] == 0xBB
226 && (guint8) ret[2] == 0xBF)
227 memmove (ret, ret + 3, len + 1 - 3);
228
229 return ret;
230 }
231
232 gchar *
gst_sub_parse_detect_encoding(const gchar * str,gsize len)233 gst_sub_parse_detect_encoding (const gchar * str, gsize len)
234 {
235 if (len >= 3 && (guint8) str[0] == 0xEF && (guint8) str[1] == 0xBB
236 && (guint8) str[2] == 0xBF)
237 return g_strdup ("UTF-8");
238
239 if (len >= 2 && (guint8) str[0] == 0xFE && (guint8) str[1] == 0xFF)
240 return g_strdup ("UTF-16BE");
241
242 if (len >= 2 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE)
243 return g_strdup ("UTF-16LE");
244
245 if (len >= 4 && (guint8) str[0] == 0x00 && (guint8) str[1] == 0x00
246 && (guint8) str[2] == 0xFE && (guint8) str[3] == 0xFF)
247 return g_strdup ("UTF-32BE");
248
249 if (len >= 4 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE
250 && (guint8) str[2] == 0x00 && (guint8) str[3] == 0x00)
251 return g_strdup ("UTF-32LE");
252
253 return NULL;
254 }
255
256 /*
257 * Typefind support.
258 */
259
260 /* FIXME 0.11: these caps are ugly, use app/x-subtitle + type field or so;
261 * also, give different subtitle formats really different types */
262 static GstStaticCaps mpl2_caps =
263 GST_STATIC_CAPS ("application/x-subtitle-mpl2");
264 #define SUB_CAPS (gst_static_caps_get (&sub_caps))
265
266 static GstStaticCaps tmp_caps =
267 GST_STATIC_CAPS ("application/x-subtitle-tmplayer");
268 #define TMP_CAPS (gst_static_caps_get (&tmp_caps))
269
270 static GstStaticCaps sub_caps = GST_STATIC_CAPS ("application/x-subtitle");
271 #define MPL2_CAPS (gst_static_caps_get (&mpl2_caps))
272
273 static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami");
274 #define SAMI_CAPS (gst_static_caps_get (&smi_caps))
275
276 static GstStaticCaps dks_caps = GST_STATIC_CAPS ("application/x-subtitle-dks");
277 #define DKS_CAPS (gst_static_caps_get (&dks_caps))
278
279 static GstStaticCaps vtt_caps = GST_STATIC_CAPS ("application/x-subtitle-vtt");
280 #define VTT_CAPS (gst_static_caps_get (&vtt_caps))
281
282 static GstStaticCaps qttext_caps =
283 GST_STATIC_CAPS ("application/x-subtitle-qttext");
284 #define QTTEXT_CAPS (gst_static_caps_get (&qttext_caps))
285
286 static GstStaticCaps lrc_caps = GST_STATIC_CAPS ("application/x-subtitle-lrc");
287 #define LRC_CAPS (gst_static_caps_get (&lrc_caps))
288
289 static void
gst_sub_parse_type_find(GstTypeFind * tf,gpointer private)290 gst_sub_parse_type_find (GstTypeFind * tf, gpointer private)
291 {
292 GstSubParseFormat format;
293 const guint8 *data;
294 GstCaps *caps;
295 gchar *str;
296 gchar *encoding = NULL;
297 const gchar *end;
298
299 if (!(data = gst_type_find_peek (tf, 0, 129)))
300 return;
301
302 /* make sure string passed to _autodetect() is NUL-terminated */
303 str = g_malloc0 (129);
304 memcpy (str, data, 128);
305
306 if ((encoding = gst_sub_parse_detect_encoding (str, 128)) != NULL) {
307 gchar *converted_str;
308 GError *err = NULL;
309 gsize tmp;
310
311 converted_str =
312 gst_sub_parse_gst_convert_to_utf8 (str, 128, encoding, &tmp, &err);
313 if (converted_str == NULL) {
314 GST_DEBUG ("Encoding '%s' detected but conversion failed: %s", encoding,
315 err->message);
316 g_clear_error (&err);
317 } else {
318 g_free (str);
319 str = converted_str;
320 }
321 g_free (encoding);
322 }
323
324 /* Check if at least the first 120 chars are valid UTF8,
325 * otherwise convert as always */
326 if (!g_utf8_validate (str, 128, &end) && (end - str) < 120) {
327 gchar *converted_str;
328 gsize tmp;
329 const gchar *enc;
330
331 enc = g_getenv ("GST_SUBTITLE_ENCODING");
332 if (enc == NULL || *enc == '\0') {
333 /* if local encoding is UTF-8 and no encoding specified
334 * via the environment variable, assume ISO-8859-15 */
335 if (g_get_charset (&enc)) {
336 enc = "ISO-8859-15";
337 }
338 }
339 converted_str =
340 gst_sub_parse_gst_convert_to_utf8 (str, 128, enc, &tmp, NULL);
341 if (converted_str != NULL) {
342 g_free (str);
343 str = converted_str;
344 }
345 }
346
347 format = gst_sub_parse_data_format_autodetect (str);
348 g_free (str);
349
350 switch (format) {
351 case GST_SUB_PARSE_FORMAT_MDVDSUB:
352 GST_DEBUG ("MicroDVD format detected");
353 caps = SUB_CAPS;
354 break;
355 case GST_SUB_PARSE_FORMAT_SUBRIP:
356 GST_DEBUG ("SubRip format detected");
357 caps = SUB_CAPS;
358 break;
359 case GST_SUB_PARSE_FORMAT_MPSUB:
360 GST_DEBUG ("MPSub format detected");
361 caps = SUB_CAPS;
362 break;
363 case GST_SUB_PARSE_FORMAT_SAMI:
364 GST_DEBUG ("SAMI (time-based) format detected");
365 caps = SAMI_CAPS;
366 break;
367 case GST_SUB_PARSE_FORMAT_TMPLAYER:
368 GST_DEBUG ("TMPlayer (time based) format detected");
369 caps = TMP_CAPS;
370 break;
371 /* FIXME: our MPL2 typefinding is not really good enough to warrant
372 * returning a high probability (however, since we registered our
373 * typefinder here with a rank of MARGINAL we should pretty much only
374 * be called if most other typefinders have already run */
375 case GST_SUB_PARSE_FORMAT_MPL2:
376 GST_DEBUG ("MPL2 (time based) format detected");
377 caps = MPL2_CAPS;
378 break;
379 case GST_SUB_PARSE_FORMAT_SUBVIEWER:
380 GST_DEBUG ("SubViewer format detected");
381 caps = SUB_CAPS;
382 break;
383 case GST_SUB_PARSE_FORMAT_DKS:
384 GST_DEBUG ("DKS format detected");
385 caps = DKS_CAPS;
386 break;
387 case GST_SUB_PARSE_FORMAT_QTTEXT:
388 GST_DEBUG ("QTtext format detected");
389 caps = QTTEXT_CAPS;
390 break;
391 case GST_SUB_PARSE_FORMAT_LRC:
392 GST_DEBUG ("LRC format detected");
393 caps = LRC_CAPS;
394 break;
395 case GST_SUB_PARSE_FORMAT_VTT:
396 GST_DEBUG ("WebVTT format detected");
397 caps = VTT_CAPS;
398 break;
399 default:
400 case GST_SUB_PARSE_FORMAT_UNKNOWN:
401 GST_DEBUG ("no subtitle format detected");
402 return;
403 }
404
405 /* if we're here, it's ok */
406 gst_type_find_suggest (tf, GST_TYPE_FIND_MAXIMUM, caps);
407 }
408
/* Defines the "subparse" typefinder that GST_TYPE_FIND_REGISTER (subparse,
 * plugin) registers: name "subparse_typefind", rank GST_RANK_MARGINAL,
 * implemented by gst_sub_parse_type_find(). The string lists the file
 * extensions associated with it and SUB_CAPS is passed as its caps argument;
 * the two trailing NULLs are presumably the private data and its destroy
 * notify -- confirm against the macro definition. */
GST_TYPE_FIND_REGISTER_DEFINE (subparse, "subparse_typefind", GST_RANK_MARGINAL,
    gst_sub_parse_type_find, "srt,sub,mpsub,mdvd,smi,txt,dks,vtt", SUB_CAPS,
    NULL, NULL)
412
413 gboolean
sub_parse_element_init(GstPlugin * plugin)414 sub_parse_element_init (GstPlugin * plugin)
415 {
416 static gsize res = FALSE;
417 gboolean ret = TRUE;
418 if (g_once_init_enter (&res)) {
419 GST_DEBUG_CATEGORY_INIT (sub_parse_debug, "subparse", 0, ".sub parser");
420
421 ret |= GST_TYPE_FIND_REGISTER (subparse, plugin);
422
423 g_once_init_leave (&res, TRUE);
424 }
425 return ret;
426 }
427