mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2024-05-18 00:12:46 +00:00
Merge branch 'pachinko' into 'main'
subparse: Support for parsing SubRip font tags See merge request gstreamer/gstreamer!2787
This commit is contained in:
commit
409eb92ffe
|
@ -2,7 +2,7 @@
|
|||
* Copyright (C) <1999> Erik Walthinsen <omega@cse.ogi.edu>
|
||||
* Copyright (C) 2004 Ronald S. Bultje <rbultje@ronald.bitfreak.net>
|
||||
* Copyright (C) 2006 Tim-Philipp Müller <tim centricular net>
|
||||
* Copyright (C) 2016 Philippe Normand <pnormand@igalia.com>
|
||||
* Copyright (C) 2016, 2022 Philippe Normand <philn@igalia.com>
|
||||
* Copyright (C) 2016 Jan Schmidt <jan@centricular.com>
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
|
@ -41,8 +41,11 @@
|
|||
#include "gstsubparseelements.h"
|
||||
|
||||
#define DEFAULT_ENCODING NULL
|
||||
|
||||
#define ATTRIBUTE_REGEX "\\s?[a-zA-Z0-9\\. \t\\(\\)]*"
|
||||
static const gchar *allowed_srt_tags[] = { "i", "b", "u", NULL };
|
||||
#define QUOTED_ATTRIBUTE_REGEX "\\s?[a-zA-Z0-9\\.\t\\(\\)\"=#]*"
|
||||
|
||||
static const gchar *allowed_srt_tags[] = { "i", "b", "u", "font", NULL };
|
||||
static const gchar *allowed_vtt_tags[] =
|
||||
{ "i", "b", "c", "u", "v", "ruby", "rt", NULL };
|
||||
|
||||
|
@ -94,8 +97,10 @@ static GstFlowReturn gst_sub_parse_chain (GstPad * sinkpad, GstObject * parent,
|
|||
G_DEFINE_TYPE (GstSubParse, gst_sub_parse, GST_TYPE_ELEMENT);
|
||||
|
||||
GST_ELEMENT_REGISTER_DEFINE_WITH_CODE (subparse, "subparse",
|
||||
GST_RANK_PRIMARY, GST_TYPE_SUBPARSE, sub_parse_element_init (plugin))
|
||||
static void gst_sub_parse_dispose (GObject * object)
|
||||
GST_RANK_PRIMARY, GST_TYPE_SUBPARSE, sub_parse_element_init (plugin));
|
||||
|
||||
static void
|
||||
gst_sub_parse_dispose (GObject * object)
|
||||
{
|
||||
GstSubParse *subparse = GST_SUBPARSE (object);
|
||||
|
||||
|
@ -644,35 +649,55 @@ subrip_unescape_formatting (gchar * txt, gconstpointer allowed_tags_ptr,
|
|||
gboolean allows_tag_attributes)
|
||||
{
|
||||
gchar *res;
|
||||
gchar *buffer;
|
||||
GRegex *tag_regex;
|
||||
gchar *allowed_tags_pattern, *search_pattern;
|
||||
const gchar *replace_pattern;
|
||||
gboolean cue_has_font_tag = FALSE;
|
||||
|
||||
/* No processing needed if no escaped tag marker found in the string. */
|
||||
if (strstr (txt, "<") == NULL)
|
||||
return;
|
||||
|
||||
cue_has_font_tag = strstr (txt, "<font") != NULL;
|
||||
|
||||
GST_LOG ("SubRip cue before unescape: %s", txt);
|
||||
|
||||
/* Build a list of alternates for our regexp.
|
||||
* FIXME: Could be built once and stored */
|
||||
allowed_tags_pattern = g_strjoinv ("|", (gchar **) allowed_tags_ptr);
|
||||
/* Look for starting/ending escaped tags with optional attributes. */
|
||||
search_pattern = g_strdup_printf ("<(/)?\\ *(%s)(%s)>",
|
||||
allowed_tags_pattern, ATTRIBUTE_REGEX);
|
||||
allowed_tags_pattern,
|
||||
cue_has_font_tag ? QUOTED_ATTRIBUTE_REGEX : ATTRIBUTE_REGEX);
|
||||
GST_LOG ("Search pattern: %s", search_pattern);
|
||||
/* And unescape appropriately */
|
||||
if (allows_tag_attributes) {
|
||||
if (allows_tag_attributes || cue_has_font_tag) {
|
||||
replace_pattern = "<\\1\\2\\3>";
|
||||
|
||||
/* Unquote attributes here because the regex being matched expects it. */
|
||||
tag_regex = g_regex_new (""", 0, 0, NULL);
|
||||
buffer =
|
||||
g_regex_replace_literal (tag_regex, txt, strlen (txt), 0, "\"", 0,
|
||||
NULL);
|
||||
g_regex_unref (tag_regex);
|
||||
} else {
|
||||
replace_pattern = "<\\1\\2>";
|
||||
buffer = g_strdup (txt);
|
||||
}
|
||||
|
||||
tag_regex = g_regex_new (search_pattern, 0, 0, NULL);
|
||||
res = g_regex_replace (tag_regex, txt, strlen (txt), 0,
|
||||
res = g_regex_replace (tag_regex, buffer, strlen (buffer), 0,
|
||||
replace_pattern, 0, NULL);
|
||||
|
||||
g_free (buffer);
|
||||
|
||||
/* res will always be shorter than the input or identical, so this
|
||||
* copy is OK */
|
||||
strcpy (txt, res);
|
||||
|
||||
GST_LOG ("SubRip cue after unescape: %s", txt);
|
||||
|
||||
g_free (res);
|
||||
g_free (search_pattern);
|
||||
g_free (allowed_tags_pattern);
|
||||
|
@ -709,7 +734,8 @@ subrip_remove_unhandled_tags (gchar * txt)
|
|||
gchar *pos, *gt;
|
||||
|
||||
for (pos = txt; pos != NULL && *pos != '\0'; ++pos) {
|
||||
if (strncmp (pos, "<", 4) == 0 && (gt = strstr (pos + 4, ">"))) {
|
||||
if (strncmp (pos, "<", 4) == 0 && strncmp (pos, "<font", 8)
|
||||
&& (gt = strstr (pos + 4, ">"))) {
|
||||
if (subrip_remove_unhandled_tag (pos, gt + strlen (">")))
|
||||
--pos;
|
||||
}
|
||||
|
@ -740,6 +766,7 @@ subrip_fix_up_markup (gchar ** p_txt, gconstpointer allowed_tags_ptr)
|
|||
|
||||
open_tags = g_ptr_array_new_with_free_func (g_free);
|
||||
cur = *p_txt;
|
||||
GST_LOG ("Fixing up markup in SubRip cue: %s", cur);
|
||||
while (*cur != '\0') {
|
||||
next_tag = strchr (cur, '<');
|
||||
if (next_tag == NULL)
|
||||
|
@ -747,9 +774,17 @@ subrip_fix_up_markup (gchar ** p_txt, gconstpointer allowed_tags_ptr)
|
|||
offset = 0;
|
||||
index = 0;
|
||||
while (index < g_strv_length (allowed_tags)) {
|
||||
gboolean has_font_tag = FALSE;
|
||||
|
||||
iter_tag = allowed_tags[index];
|
||||
/* Look for a white listed tag */
|
||||
cur_tag = g_strconcat ("<", iter_tag, ATTRIBUTE_REGEX, ">", NULL);
|
||||
has_font_tag = strncmp (iter_tag, "font", 4) == 0
|
||||
&& strstr (next_tag, "<font");
|
||||
|
||||
/* Look for a allow-listed tag */
|
||||
cur_tag =
|
||||
g_strconcat ("<", iter_tag,
|
||||
has_font_tag ? QUOTED_ATTRIBUTE_REGEX : ATTRIBUTE_REGEX, ">", NULL);
|
||||
GST_LOG ("Looking for tag matching %s in %s", cur_tag, next_tag);
|
||||
tag_regex = g_regex_new (cur_tag, 0, 0, NULL);
|
||||
(void) g_regex_match (tag_regex, next_tag, 0, &match_info);
|
||||
|
||||
|
@ -824,6 +859,46 @@ subrip_fix_up_markup (gchar ** p_txt, gconstpointer allowed_tags_ptr)
|
|||
g_ptr_array_free (open_tags, TRUE);
|
||||
}
|
||||
|
||||
/* <font> is the only SubRip tag that has no direct equivalent to Pango, so
|
||||
* manually convert <font> tags to <span> tags, preserving (optional color)
|
||||
* attribute(s). */
|
||||
static void
|
||||
subrip_to_pango_markup (gchar * read)
|
||||
{
|
||||
gchar *write = read;
|
||||
|
||||
for (guint i = 0; i < 7; i++) {
|
||||
if (read[i] == '\0')
|
||||
return;
|
||||
}
|
||||
|
||||
/* in-place replacement should be safe here because strlen("font") ==
|
||||
* strlen("span") */
|
||||
do {
|
||||
if (strncmp (read, "<font", 5) == 0) {
|
||||
*write++ = '<';
|
||||
*write++ = 's';
|
||||
*write++ = 'p';
|
||||
*write++ = 'a';
|
||||
*write++ = 'n';
|
||||
read += 5;
|
||||
} else if (strncmp (read, "</font>", 7) == 0) {
|
||||
*write++ = '<';
|
||||
*write++ = '/';
|
||||
*write++ = 's';
|
||||
*write++ = 'p';
|
||||
*write++ = 'a';
|
||||
*write++ = 'n';
|
||||
*write++ = '>';
|
||||
read += 7;
|
||||
} else {
|
||||
*write++ = *read++;
|
||||
}
|
||||
} while (*read);
|
||||
|
||||
*write = '\0';
|
||||
}
|
||||
|
||||
static gboolean
|
||||
parse_subrip_time (const gchar * ts_string, GstClockTime * t)
|
||||
{
|
||||
|
@ -1048,6 +1123,7 @@ parse_subrip (ParserState * state, const gchar * line)
|
|||
subrip_remove_unhandled_tags (ret);
|
||||
strip_trailing_newlines (ret);
|
||||
subrip_fix_up_markup (&ret, state->allowed_tags);
|
||||
subrip_to_pango_markup (ret);
|
||||
return ret;
|
||||
}
|
||||
return NULL;
|
||||
|
|
|
@ -108,8 +108,9 @@ static SubParseInputChunk srt_input[] = {
|
|||
"27\n00:06:00,000 --> 00:08:00,000\nRock & Roll\n\n",
|
||||
360 * GST_SECOND, 480 * GST_SECOND, "Rock & Roll"}, {
|
||||
"28\n00:10:00,000 --> 00:11:00,000\n"
|
||||
"<font \"#0000FF\"><joj>This is </xxx>in blue but <5</font>\n\n",
|
||||
600 * GST_SECOND, 660 * GST_SECOND, "This is in blue but <5"}, {
|
||||
"<font color=\"#0000FF\"><joj>This is </xxx>in blue but <5</font>\n\n",
|
||||
600 * GST_SECOND, 660 * GST_SECOND,
|
||||
"<span color=\"#0000FF\">This is in blue but <5</span>"}, {
|
||||
/* closing tags should be recognised properly even if there's a space */
|
||||
"29\n00:11:00,000 --> 00:12:00,000\n" "<i>italics</ i>\n\n",
|
||||
660 * GST_SECOND, 720 * GST_SECOND, "<i>italics</i>"}, {
|
||||
|
@ -203,6 +204,22 @@ static SubParseInputChunk srt_input6[] = {
|
|||
,
|
||||
};
|
||||
|
||||
/* Test SubRip font tags, attributes should not be stripped, <font> tag should
|
||||
be converted to a <span> tag. */
|
||||
static SubParseInputChunk srt_input7[] = {
|
||||
{"1\n00:00:01,000 --> 00:00:02,000\n<font color=\"#EAC118\">Fancy "
|
||||
"yellow</font>\n\n",
|
||||
1 * GST_SECOND, 2 * GST_SECOND,
|
||||
"<span color=\"#EAC118\">Fancy yellow</span>"},
|
||||
{"1\n00:00:01,000 --> 00:00:02,000\n<font color=\"#EAC118\">Fancy "
|
||||
"yellow\n\n",
|
||||
1 * GST_SECOND, 2 * GST_SECOND,
|
||||
"<span color=\"#EAC118\">Fancy yellow</span>"},
|
||||
{"1\n00:00:01,000 --> 00:00:02,000\n<font color=\"yellow\">Fancy "
|
||||
"yellow\n\n",
|
||||
1 * GST_SECOND, 2 * GST_SECOND,
|
||||
"<span color=\"yellow\">Fancy yellow</span>"},
|
||||
};
|
||||
|
||||
static void
|
||||
setup_subparse (void)
|
||||
|
@ -402,6 +419,9 @@ GST_START_TEST (test_srt)
|
|||
|
||||
/* try without an empty line at the end */
|
||||
test_srt_do_test (srt_input6, 0, G_N_ELEMENTS (srt_input6));
|
||||
|
||||
/* try with font tags */
|
||||
test_srt_do_test (srt_input7, 0, G_N_ELEMENTS (srt_input7));
|
||||
}
|
||||
|
||||
GST_END_TEST;
|
||||
|
|
Loading…
Reference in a new issue