diff --git a/subprojects/gst-plugins-good/gst/audioparsers/gstmpegaudioparse.c b/subprojects/gst-plugins-good/gst/audioparsers/gstmpegaudioparse.c index f5e0fc2290..521ed7ec3c 100644 --- a/subprojects/gst-plugins-good/gst/audioparsers/gstmpegaudioparse.c +++ b/subprojects/gst-plugins-good/gst/audioparsers/gstmpegaudioparse.c @@ -35,6 +35,84 @@ * */ +/* Notes about gapless playback, "Frankenstein" streams, and the Xing header frame: + * + * Gapless playback is based on the LAME tag, which is located in the Xing + * header frame. The tag contains the encoder delay and encoder padding. + * The encoder delay specifies how many padding nullsamples have been prepended + * by the encoder at the start of the mp3 stream, while the encoder padding + * specifies how many padding nullsamples got added at the end of the stream. + * + * In addition, there is also a "decoder delay". This affects all existing + * mp3 decoders - they themselves introduce a delay into the signal due to + * the way mp3 decoding works. This delay is 529 samples long in all known + * decoders. Unlike the encoder delay, the decoder delay is not specified + * anywhere in the mp3 stream. Players/decoders therefore hardcode the + * decoder delay as 529 samples. + * + * (The LAME tech FAQ mentions 528 samples instead of 529, but LAME seems to + * use 529 samples. Also, decoders like mpg123 use 529 samples instead of 528. + * The situation is a little unclear, but 529 samples seems to be standard.) + * + * For proper gapless playback, both mpegaudioparse and a downstream MPEG + * audio decoder must do their part. mpegaudioparse adjusts buffer PTS/DTS + * and durations, and adds GstAudioClippingMeta to outgoing buffers if + * clipping is necessary. MPEG decoders then clip decoded frames according + * to that meta (if present). + * + * To detect when to add GstAudioClippingMeta and when to adjust PTS/DTS/ + * durations, the number of the current frame is retrieved. 
Based on that, the + * current stream position in samples is calculated. With the sample position, + * it is determined whether or not the current playback position is still + * in the actual playback range (= in the actual playback range of the stream + * that excludes padding samples), or if it is already outside, or partially + * outside. + * + * start_of_actual_samples and end_of_actual_samples define the start/end + * of this actual playback range, in samples. So: + * If sample_pos >= start_of_actual_samples and sample_pos < end_of_actual_samples + * -> sample_pos is inside the actual playback range. + * + * (The decoder delay could in theory be left for the decoder to worry + * about. But then, the decoder would also have to adjust PTS/DTS/durations + * of decoded buffers, which is not something a GstAudioDecoder based element + * should have to deal with. So, for convenience, mpegaudioparse also factors + * that delay into its calculations.) + * + * + * "Frankenstein" streams are MPEG streams which have frames beyond + * what the Xing metadata indicates. Such streams typically are the + * result of poorly stitching individual mp3s together, like this: + * + * cat first.mp3 second.mp3 > joined.mp3 + * + * The resulting mp3 is not guaranteed to be valid. In particular, this can + * cause confusion when first.mp3 contains a Xing header frame. Its length + * indicator then does not match the actual length (which is bigger). When + * this is detected, a log line about this being a Frankenstein stream is + * generated. + * + * + * Xing header frames are empty dummy MPEG frames. They only exist for + * supplying metadata. They are encoded as valid silent MPEG frames for + * backwards compatibility with older hardware MP3 players, but can be safely + * dropped. 
+ * + * For more about Xing header frames, see: + * https://www.codeproject.com/Articles/8295/MPEG-Audio-Frame-Header#XINGHeader + * https://www.compuphase.com/mp3/mp3loops.htm#PADDING_DELAYS + * + * To facilitate gapless playback and ensure that MPEG audio decoders don't + * actually decode this frame as an empty MPEG frame, it is marked here as + * GST_BUFFER_FLAG_DECODE_ONLY / GST_BUFFER_FLAG_DROPPABLE in mpegaudioparse + * after its metadata got extracted. It is also marked as such if it is + * encountered again after the user for example seeked back to the beginning + * of the mp3 stream. Its duration is also set to zero to make sure that the + * frame does not cause baseparse to increment the timestamp of the frame that + * follows this one. + * + */ + /* FIXME: we should make the base class (GstBaseParse) aware of the * XING seek table somehow, so it can use it properly for things like * accurate seeks. Currently it can only do a lookup via the convert function, @@ -98,12 +176,20 @@ static GstFlowReturn gst_mpeg_audio_parse_handle_frame (GstBaseParse * parse, GstBaseParseFrame * frame, gint * skipsize); static GstFlowReturn gst_mpeg_audio_parse_pre_push_frame (GstBaseParse * parse, GstBaseParseFrame * frame); +static gboolean gst_mpeg_audio_parse_src_query (GstBaseParse * parse, + GstQuery * query); +static gboolean gst_mpeg_audio_parse_sink_event (GstBaseParse * parse, + GstEvent * event); static gboolean gst_mpeg_audio_parse_convert (GstBaseParse * parse, GstFormat src_format, gint64 src_value, GstFormat dest_format, gint64 * dest_value); static GstCaps *gst_mpeg_audio_parse_get_sink_caps (GstBaseParse * parse, GstCaps * filter); +static gboolean +gst_mpeg_audio_parse_check_if_is_xing_header_frame (GstMpegAudioParse * + mp3parse, GstBuffer * buf); + static void gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse, GstBuffer * buf); @@ -166,6 +252,8 @@ gst_mpeg_audio_parse_class_init (GstMpegAudioParseClass * klass) GST_DEBUG_FUNCPTR 
(gst_mpeg_audio_parse_handle_frame); parse_class->pre_push_frame = GST_DEBUG_FUNCPTR (gst_mpeg_audio_parse_pre_push_frame); + parse_class->src_query = GST_DEBUG_FUNCPTR (gst_mpeg_audio_parse_src_query); + parse_class->sink_event = GST_DEBUG_FUNCPTR (gst_mpeg_audio_parse_sink_event); parse_class->convert = GST_DEBUG_FUNCPTR (gst_mpeg_audio_parse_convert); parse_class->get_sink_caps = GST_DEBUG_FUNCPTR (gst_mpeg_audio_parse_get_sink_caps); @@ -194,12 +282,16 @@ gst_mpeg_audio_parse_class_init (GstMpegAudioParseClass * klass) static void gst_mpeg_audio_parse_reset (GstMpegAudioParse * mp3parse) { + mp3parse->upstream_format = GST_FORMAT_UNDEFINED; mp3parse->channels = -1; mp3parse->rate = -1; mp3parse->sent_codec_tag = FALSE; mp3parse->last_posted_crc = CRC_UNKNOWN; mp3parse->last_posted_channel_mode = MPEG_AUDIO_CHANNEL_MODE_UNKNOWN; mp3parse->freerate = 0; + mp3parse->spf = 0; + + mp3parse->outgoing_frame_is_xing_header = FALSE; mp3parse->hdr_bitrate = 0; mp3parse->bitrate_is_constant = TRUE; @@ -224,6 +316,12 @@ gst_mpeg_audio_parse_reset (GstMpegAudioParse * mp3parse) mp3parse->encoder_delay = 0; mp3parse->encoder_padding = 0; + mp3parse->decoder_delay = 0; + mp3parse->start_of_actual_samples = 0; + mp3parse->end_of_actual_samples = 0; + mp3parse->total_padding_time = GST_CLOCK_TIME_NONE; + mp3parse->start_padding_time = GST_CLOCK_TIME_NONE; + mp3parse->end_padding_time = GST_CLOCK_TIME_NONE; } static void @@ -745,6 +843,11 @@ gst_mpeg_audio_parse_handle_frame (GstBaseParse * parse, mp3parse->spf = 576; } + /* We need the frame duration for calculating the frame number later + * in gst_mpeg_audio_parse_pre_push_frame (). */ + mp3parse->frame_duration = gst_util_uint64_scale (GST_SECOND, + mp3parse->spf, mp3parse->rate); + /* lead_in: * We start pushing 9 frames earlier (29 frames for MPEG2) than * segment start to be able to decode the first frame we want. 
@@ -764,6 +867,21 @@ gst_mpeg_audio_parse_handle_frame (GstBaseParse * parse, } mp3parse->hdr_bitrate = bitrate; + /* While during normal playback, the Xing header frame is seen only once + * (right at the beginning), we may see it again if the user seeked back + * to the beginning. To make sure it is dropped again and NOT pushed + * downstream, we have to check every frame for Xing IDs. + * + * (sent_codec_tag is TRUE after this Xing frame got parsed.) */ + if (G_LIKELY (mp3parse->sent_codec_tag)) { + if (G_UNLIKELY (gst_mpeg_audio_parse_check_if_is_xing_header_frame + (mp3parse, buf))) { + GST_DEBUG_OBJECT (mp3parse, "This is a Xing header frame, which " + "contains no meaningful audio data, and can be safely dropped"); + mp3parse->outgoing_frame_is_xing_header = TRUE; + } + } + /* For first frame; check for seek tables and output a codec tag */ gst_mpeg_audio_parse_handle_first_frame (mp3parse, buf); @@ -774,6 +892,17 @@ gst_mpeg_audio_parse_handle_frame (GstBaseParse * parse, cleanup: gst_buffer_unmap (buf, &map); + /* We don't actually drop the frame right here, but rather in + * gst_mpeg_audio_parse_pre_push_frame (), since it is still important + * to let other code bits do their work there even if we want to drop + * the current frame. 
*/ + if (G_UNLIKELY (mp3parse->outgoing_frame_is_xing_header)) { + frame->flags |= GST_BASE_PARSE_FRAME_FLAG_NO_FRAME; + /* Set duration to zero to prevent the baseparse class + * from incrementing outgoing timestamps */ + GST_BUFFER_DURATION (frame->buffer) = 0; + } + if (res && bpf <= map.size) { return gst_base_parse_finish_frame (parse, frame, bpf); } @@ -781,6 +910,54 @@ cleanup: return GST_FLOW_OK; } +static gboolean +gst_mpeg_audio_parse_check_if_is_xing_header_frame (GstMpegAudioParse * + mp3parse, GstBuffer * buf) +{ + /* TODO: get rid of code duplication + * (see gst_mpeg_audio_parse_handle_first_frame ()) */ + + const guint32 xing_id = 0x58696e67; /* 'Xing' in hex */ + const guint32 info_id = 0x496e666f; /* 'Info' in hex - found in LAME CBR files */ + + gint offset_xing; + GstMapInfo map; + guint8 *data; + guint64 avail; + guint32 read_id_xing = 0; + gboolean ret = FALSE; + + /* Check first frame for Xing info */ + if (mp3parse->version == 1) { /* MPEG-1 file */ + if (mp3parse->channels == 1) + offset_xing = 0x11; + else + offset_xing = 0x20; + } else { /* MPEG-2 header */ + if (mp3parse->channels == 1) + offset_xing = 0x09; + else + offset_xing = 0x11; + } + + /* Skip the 4 bytes of the MP3 header too */ + offset_xing += 4; + + /* Check if we have enough data to read the Xing header */ + gst_buffer_map (buf, &map, GST_MAP_READ); + data = map.data; + avail = map.size; + + if (avail >= offset_xing + 4) { + read_id_xing = GST_READ_UINT32_BE (data + offset_xing); + ret = (read_id_xing == xing_id || read_id_xing == info_id); + } + + gst_buffer_unmap (buf, &map); + + return ret; +} + static void gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse, GstBuffer * buf) @@ -841,10 +1018,15 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse, guint32 xing_flags; guint bytes_needed = offset_xing + 8; gint64 total_bytes; + guint64 num_xing_samples = 0; GstClockTime total_time; GST_DEBUG_OBJECT (mp3parse, "Found Xing header marker 
0x%x", xing_id); + GST_DEBUG_OBJECT (mp3parse, "This is a Xing header frame, which contains " + "no meaningful audio data, and can be safely dropped"); + mp3parse->outgoing_frame_is_xing_header = TRUE; + /* Move data after Xing header */ data += offset_xing + 4; @@ -875,9 +1057,9 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse, "Invalid number of frames in Xing header"); mp3parse->xing_flags &= ~XING_FRAMES_FLAG; } else { + num_xing_samples = (guint64) (mp3parse->xing_frames) * (mp3parse->spf); mp3parse->xing_total_time = gst_util_uint64_scale (GST_SECOND, - (guint64) (mp3parse->xing_frames) * (mp3parse->spf), - mp3parse->rate); + num_xing_samples, mp3parse->rate); } data += 4; @@ -886,6 +1068,10 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse, mp3parse->xing_total_time = 0; } + /* Store the entire time as actual total time for now. Should there be + * any padding present, this value will get adjusted accordingly. */ + mp3parse->xing_actual_total_time = mp3parse->xing_total_time; + if (xing_flags & XING_BYTES_FLAG) { mp3parse->xing_bytes = GST_READ_UINT32_BE (data); if (mp3parse->xing_bytes == 0) { @@ -967,8 +1153,10 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse, } else mp3parse->xing_vbr_scale = 0; - GST_DEBUG_OBJECT (mp3parse, "Xing header reported %u frames, time %" - GST_TIME_FORMAT ", %u bytes, vbr scale %u", mp3parse->xing_frames, + GST_DEBUG_OBJECT (mp3parse, "Xing header reported %u frames, %" + G_GUINT64_FORMAT " samples, time %" GST_TIME_FORMAT + " (this includes potentially present padding data), %u bytes," + " vbr scale %u", mp3parse->xing_frames, num_xing_samples, GST_TIME_ARGS (mp3parse->xing_total_time), mp3parse->xing_bytes, mp3parse->xing_vbr_scale); @@ -986,6 +1174,8 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse, gchar lame_version[10] = { 0, }; guint tag_rev; guint32 encoder_delay, encoder_padding; + guint64 total_padding_samples; + guint64 
actual_num_xing_samples; memcpy (lame_version, data, 9); data += 9; @@ -1001,11 +1191,63 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse, encoder_padding = GST_READ_UINT24_BE (data); encoder_padding &= 0x000fff; + total_padding_samples = encoder_delay + encoder_padding; + mp3parse->encoder_delay = encoder_delay; mp3parse->encoder_padding = encoder_padding; - GST_DEBUG_OBJECT (mp3parse, "Encoder delay %u, encoder padding %u", - encoder_delay, encoder_padding); + /* As mentioned in the overview at the beginning of this source + * file, decoders exhibit a delay of 529 samples. */ + mp3parse->decoder_delay = 529; + + /* Where the actual, non-padding samples start & end, in sample offsets. */ + mp3parse->start_of_actual_samples = mp3parse->encoder_delay + + mp3parse->decoder_delay; + mp3parse->end_of_actual_samples = num_xing_samples + + mp3parse->decoder_delay - mp3parse->encoder_padding; + + /* Length of padding at the start and at the end of the stream, + * in nanoseconds. */ + mp3parse->start_padding_time = gst_util_uint64_scale_int (GST_SECOND, + mp3parse->start_of_actual_samples, mp3parse->rate); + mp3parse->end_padding_time = mp3parse->xing_total_time - + gst_util_uint64_scale_int (mp3parse->end_of_actual_samples, + GST_SECOND, mp3parse->rate); + + /* Total length of all combined padding samples, in nanoseconds. */ + mp3parse->total_padding_time = gst_util_uint64_scale_int (GST_SECOND, + total_padding_samples, mp3parse->rate); + + /* Length of media, in samples, without the number of padding samples. */ + actual_num_xing_samples = (num_xing_samples >= total_padding_samples) ? + (num_xing_samples - total_padding_samples) : 0; + /* Length of media, converted to nanoseconds. This is used for setting + * baseparse's duration. 
*/ + mp3parse->xing_actual_total_time = gst_util_uint64_scale (GST_SECOND, + actual_num_xing_samples, mp3parse->rate); + + GST_DEBUG_OBJECT (mp3parse, "Encoder delay: %u samples", + mp3parse->encoder_delay); + GST_DEBUG_OBJECT (mp3parse, "Encoder padding: %u samples", + mp3parse->encoder_padding); + GST_DEBUG_OBJECT (mp3parse, "Decoder delay: %u samples", + mp3parse->decoder_delay); + GST_DEBUG_OBJECT (mp3parse, "Start of actual samples: %" + G_GUINT64_FORMAT, mp3parse->start_of_actual_samples); + GST_DEBUG_OBJECT (mp3parse, "End of actual samples: %" + G_GUINT64_FORMAT, mp3parse->end_of_actual_samples); + GST_DEBUG_OBJECT (mp3parse, "Total padding samples: %" G_GUINT64_FORMAT, + total_padding_samples); + GST_DEBUG_OBJECT (mp3parse, "Start padding time: %" GST_TIME_FORMAT, + GST_TIME_ARGS (mp3parse->start_padding_time)); + GST_DEBUG_OBJECT (mp3parse, "End padding time: %" GST_TIME_FORMAT, + GST_TIME_ARGS (mp3parse->end_padding_time)); + GST_DEBUG_OBJECT (mp3parse, "Total padding time: %" GST_TIME_FORMAT, + GST_TIME_ARGS (mp3parse->total_padding_time)); + GST_DEBUG_OBJECT (mp3parse, "Actual total media samples: %" + G_GUINT64_FORMAT, actual_num_xing_samples); + GST_DEBUG_OBJECT (mp3parse, "Actual total media length: %" + GST_TIME_FORMAT, GST_TIME_ARGS (mp3parse->xing_actual_total_time)); } } else if (read_id_vbri == vbri_id) { gint64 total_bytes, total_frames; @@ -1143,7 +1385,7 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse, /* set duration if tables provided a valid one */ if (mp3parse->xing_flags & XING_FRAMES_FLAG) { gst_base_parse_set_duration (GST_BASE_PARSE (mp3parse), GST_FORMAT_TIME, - mp3parse->xing_total_time, 0); + mp3parse->xing_actual_total_time, 0); } if (mp3parse->vbri_total_time != 0 && mp3parse->vbri_valid) { gst_base_parse_set_duration (GST_BASE_PARSE (mp3parse), GST_FORMAT_TIME, @@ -1318,6 +1560,91 @@ gst_mpeg_audio_parse_bytepos_to_time (GstMpegAudioParse * mp3parse, return FALSE; } +static gboolean 
+gst_mpeg_audio_parse_src_query (GstBaseParse * parse, GstQuery * query) +{ + gboolean res = FALSE; + GstMpegAudioParse *mp3parse = GST_MPEG_AUDIO_PARSE (parse); + + res = GST_BASE_PARSE_CLASS (parent_class)->src_query (parse, query); + if (!res) + return FALSE; + + /* If upstream operates in BYTE format then consider any parsed Xing/LAME + * header to remove encoder/decoder delay and padding samples from the + * position query. Streams without a parsed LAME tag carry no padding info (total_padding_time stays GST_CLOCK_TIME_NONE), so their position is left untouched instead of doing arithmetic on GST_CLOCK_TIME_NONE. */ + if ((mp3parse->upstream_format == GST_FORMAT_BYTES + || GST_PAD_MODE (GST_BASE_PARSE_SINK_PAD (parse)) == GST_PAD_MODE_PULL) && GST_CLOCK_TIME_IS_VALID (mp3parse->total_padding_time)) { + switch (GST_QUERY_TYPE (query)) { + case GST_QUERY_POSITION:{ + GstFormat format; + gint64 position, new_position; + GstClockTime duration_to_skip; + gst_query_parse_position (query, &format, &position); + + /* Adjust the position to exclude padding samples. */ + + if ((position < 0) || (format != GST_FORMAT_TIME)) + break; + + duration_to_skip = mp3parse->frame_duration + + mp3parse->start_padding_time; + + if (position < duration_to_skip) + new_position = 0; + else + new_position = position - duration_to_skip; + + if (new_position > (mp3parse->xing_actual_total_time)) + new_position = mp3parse->xing_actual_total_time; + + GST_LOG_OBJECT (mp3parse, "applying gapless padding info to position " + "query response: %" GST_TIME_FORMAT " -> %" GST_TIME_FORMAT, + GST_TIME_ARGS (position), GST_TIME_ARGS (new_position)); + + gst_query_set_position (query, GST_FORMAT_TIME, new_position); + + break; + } + + default: + break; + } + } + + return res; +} + +static gboolean +gst_mpeg_audio_parse_sink_event (GstBaseParse * parse, GstEvent * event) +{ + gboolean res = FALSE; + GstMpegAudioParse *mp3parse = GST_MPEG_AUDIO_PARSE (parse); + + res = + GST_BASE_PARSE_CLASS (parent_class)->sink_event (parse, + gst_event_ref (event)); + if (!res) { + gst_event_unref (event); + return FALSE; + } + + switch (GST_EVENT_TYPE (event)) { + case GST_EVENT_SEGMENT:{ + const GstSegment *segment; + + gst_event_parse_segment (event, &segment); 
+ mp3parse->upstream_format = segment->format; + } + default: + break; + } + + gst_event_unref (event); + + return res; +} + static gboolean gst_mpeg_audio_parse_convert (GstBaseParse * parse, GstFormat src_format, gint64 src_value, GstFormat dest_format, gint64 * dest_value) @@ -1418,6 +1745,179 @@ gst_mpeg_audio_parse_pre_push_frame (GstBaseParse * parse, gst_tag_list_unref (taglist); } + /* adjust buffer PTS/DTS/durations according to gapless playback info */ + if ((mp3parse->upstream_format == GST_FORMAT_BYTES + || GST_PAD_MODE (GST_BASE_PARSE_SINK_PAD (parse)) == + GST_PAD_MODE_PULL) + && GST_CLOCK_TIME_IS_VALID (mp3parse->total_padding_time)) { + guint64 frame_nr; + GstClockTime pts, dts; + gboolean add_clipping_meta = FALSE; + guint32 start_clip = 0, end_clip = 0; + GstClockTime timestamp_decrement; + guint64 sample_pos; + guint64 sample_pos_end; + + /* Get the number of the current frame so we can determine where we + * currently are in the MPEG stream. + * + * Gapless playback is best done based on samples, not timestamps, + * to avoid potential rounding errors that can otherwise cause a few + * samples to be incorrectly clipped or not clipped. + * + * TODO: At the moment, there is no dedicated baseparse API for finding + * out what frame we are currently in. The frame number is calculated + * out of the PTS of the current frame. Each frame has the same duration, + * and at this point, the buffer's PTS has not been adjusted to exclude + * the padding samples, so the PTS will be an integer multiple of + * frame_duration. However, this is not an ideal solution. Investigate + * how to properly implement this. */ + frame_nr = GST_BUFFER_PTS (frame->buffer) / mp3parse->frame_duration; + GST_LOG_OBJECT (mp3parse, "Handling MP3 frame #%" G_GUINT64_FORMAT, + frame_nr); + + /* By default, we subtract the start_padding_time from the timestamps. + * start_padding_time specifies the duration of the padding samples + * at the beginning of the MPEG stream. 
To factor out these padding + * samples, we have to shift the timestamps back, which is done with + * this decrement. */ + timestamp_decrement = mp3parse->start_padding_time; + + pts = GST_BUFFER_PTS (frame->buffer); + dts = GST_BUFFER_DTS (frame->buffer); + + /* sample_pos specifies the current position of the beginning of the + * current frame, while sample_pos_end specifies the current position + * of 1 sample past the end of the current frame. Both values are + * in samples. */ + sample_pos = frame_nr * mp3parse->spf; + sample_pos_end = sample_pos + mp3parse->spf; + + /* Check if the frame is not (fully) within the actual playback range. */ + if (G_UNLIKELY (sample_pos <= mp3parse->start_of_actual_samples || + (sample_pos_end >= mp3parse->end_of_actual_samples))) { + + if (G_UNLIKELY (frame_nr >= mp3parse->xing_frames)) { + /* Test #1: Check if the current position lies past the length + * that is specified by the Xing frame header. This normally does + * not happen, but does occur with "Frankenstein" streams (see + * the explanation at the beginning of this source file for more). + * Do this first, since the other test may yield false positives + * in this case. */ + GST_LOG_OBJECT (mp3parse, "There are frames beyond what the Xing " + "metadata indicates; this is a Frankenstein stream!"); + + /* The frames past the "officially" last one (= the last one according + * to the Xing header frame) are located past the padding samples + * that follow the actual playback range. The length of these + * padding samples in nanoseconds is stored in end_padding_time. + * We need to shift the PTS to compensate for these padding samples, + * otherwise there would be a timestamp discontinuity between the + * last "official" frame and the first "Frankenstein" frame. */ + timestamp_decrement += mp3parse->end_padding_time; + } else if (sample_pos_end <= mp3parse->start_of_actual_samples) { + /* Test #2: Check if the frame lies completely before the actual + * playback range. 
This happens if the number of padding samples + * at the start of the stream exceeds the size of a frame, meaning + * that the entire frame will be filled with padding samples. + * This has not been observed so far. However, it is in theory + * possible, so handle it here. */ + + /* We want to clip all samples in the frame. Since this is a frame + * at the start of the stream, set start_clip to the frame size. + * Also set the buffer duration to 0 to make sure baseparse does not + * increment timestamps after this current frame is finished. */ + start_clip = mp3parse->spf; + GST_BUFFER_DURATION (frame->buffer) = 0; + + add_clipping_meta = TRUE; + } else if (sample_pos <= mp3parse->start_of_actual_samples) { + /* Test #3: Check if a portion of the frame lies before the actual + * playback range. Set the duration to the number of samples that + * remain after clipping. */ + + start_clip = mp3parse->start_of_actual_samples - sample_pos; + GST_BUFFER_DURATION (frame->buffer) = + gst_util_uint64_scale_int (sample_pos_end - + mp3parse->start_of_actual_samples, GST_SECOND, mp3parse->rate); + + add_clipping_meta = TRUE; + } else if (sample_pos >= mp3parse->end_of_actual_samples) { + /* Test #4: Check if the frame lies completely after the actual + * playback range. Similar to test #2, this happens if the number + * of padding samples at the end of the stream exceeds the size of + * a frame, meaning that the entire frame will be filled with padding + * samples. Unlike test #2, this has been observed in mp3s several + * times: The penultimate frame is partially clipped, the final + * frame is fully clipped. */ + + GstClockTime padding_ns; + + /* We want to clip all samples in the frame. Since this is a frame + * at the end of the stream, set end_clip to the frame size. + * Also set the buffer duration to 0 to make sure baseparse does not + * increment timestamps after this current frame is finished. 
*/ + end_clip = mp3parse->spf; + GST_BUFFER_DURATION (frame->buffer) = 0; + + /* Even though this frame will be fully clipped, we still have to + * make sure its timestamps are not discontinuous with the preceding + * ones. To that end, it is necessary to subtract the time range + * between the current position and the last valid playback range + * position from the PTS and DTS. */ + padding_ns = gst_util_uint64_scale_int (sample_pos - + mp3parse->end_of_actual_samples, GST_SECOND, mp3parse->rate); + timestamp_decrement += padding_ns; + + add_clipping_meta = TRUE; + } else if (sample_pos_end >= mp3parse->end_of_actual_samples) { + /* Test #5: Check if a portion of the frame lies after the actual + * playback range. Set the duration to the number of samples that + * remain after clipping. */ + + end_clip = sample_pos_end - mp3parse->end_of_actual_samples; + GST_BUFFER_DURATION (frame->buffer) = + gst_util_uint64_scale_int (mp3parse->end_of_actual_samples - + sample_pos, GST_SECOND, mp3parse->rate); + + add_clipping_meta = TRUE; + } + } + + if (G_UNLIKELY (add_clipping_meta)) { + GST_DEBUG_OBJECT (mp3parse, "Adding clipping meta: start %" + G_GUINT32_FORMAT " end %" G_GUINT32_FORMAT, start_clip, end_clip); + gst_buffer_add_audio_clipping_meta (frame->buffer, GST_FORMAT_DEFAULT, + start_clip, end_clip); + } + + /* Adjust the timestamps by subtracting from them. The decrement + * is computed above. */ + GST_BUFFER_PTS (frame->buffer) = (pts >= timestamp_decrement) ? (pts - + timestamp_decrement) : 0; + GST_BUFFER_DTS (frame->buffer) = (dts >= timestamp_decrement) ? (dts - + timestamp_decrement) : 0; + + /* NOTE: We do not adjust the size here, just the timestamps and duration. + * We also do not drop fully clipped frames. This is because downstream + * MPEG audio decoders still need the data of the frame, even if it gets + * fully clipped later. They do need these frames for their decoding process. 
+ * If these frames were dropped, the decoders would not fully decode all + * of the data from the MPEG stream. */ + + /* TODO: Should offset/offset_end also be adjusted? */ + } + + /* Check if this frame can safely be dropped (for example, because it is an + * empty Xing header frame). */ + if (G_UNLIKELY (mp3parse->outgoing_frame_is_xing_header)) { + GST_DEBUG_OBJECT (mp3parse, "Marking frame as decode-only / droppable"); + mp3parse->outgoing_frame_is_xing_header = FALSE; + GST_BUFFER_DURATION (frame->buffer) = 0; + GST_BUFFER_FLAG_SET (frame->buffer, GST_BUFFER_FLAG_DECODE_ONLY); + GST_BUFFER_FLAG_SET (frame->buffer, GST_BUFFER_FLAG_DROPPABLE); + } + /* usual clipping applies */ frame->flags |= GST_BASE_PARSE_FRAME_FLAG_CLIP; diff --git a/subprojects/gst-plugins-good/gst/audioparsers/gstmpegaudioparse.h b/subprojects/gst-plugins-good/gst/audioparsers/gstmpegaudioparse.h index e7fa8099fb..c77663c6b7 100644 --- a/subprojects/gst-plugins-good/gst/audioparsers/gstmpegaudioparse.h +++ b/subprojects/gst-plugins-good/gst/audioparsers/gstmpegaudioparse.h @@ -51,14 +51,19 @@ struct _GstMpegAudioParse { GstBaseParse baseparse; /*< private >*/ + GstFormat upstream_format; + gint rate; gint channels; gint layer; gint version; GstClockTime max_bitreservoir; - /* samples per frame */ - gint spf; + + /* Samples per frame */ + gint spf; + + GstClockTime frame_duration; gint freerate; @@ -67,6 +72,8 @@ struct _GstMpegAudioParse { gint last_posted_crc, last_crc; guint last_posted_channel_mode, last_mode; + gboolean outgoing_frame_is_xing_header; + /* Bitrate from non-vbr headers */ guint32 hdr_bitrate; gboolean bitrate_is_constant; @@ -75,6 +82,7 @@ struct _GstMpegAudioParse { guint32 xing_flags; guint32 xing_frames; GstClockTime xing_total_time; + GstClockTime xing_actual_total_time; guint32 xing_bytes; /* percent -> filepos mapping */ guchar xing_seek_table[100]; @@ -95,6 +103,14 @@ struct _GstMpegAudioParse { /* LAME info */ guint32 encoder_delay; guint32 encoder_padding; + + 
/* Gapless playback states */ + guint32 decoder_delay; + guint64 start_of_actual_samples; + guint64 end_of_actual_samples; + GstClockTime start_padding_time; + GstClockTime end_padding_time; + GstClockTime total_padding_time; }; /** diff --git a/subprojects/gst-plugins-good/tests/check/elements/mpegaudioparse.c b/subprojects/gst-plugins-good/tests/check/elements/mpegaudioparse.c index c144e95fac..db2075a450 100644 --- a/subprojects/gst-plugins-good/tests/check/elements/mpegaudioparse.c +++ b/subprojects/gst-plugins-good/tests/check/elements/mpegaudioparse.c @@ -24,6 +24,8 @@ */ #include +#include +#include #include "parser.h" #define SRC_CAPS_TMPL "audio/mpeg, parsed=(boolean)false, mpegversion=(int)1" @@ -123,6 +125,294 @@ GST_START_TEST (test_parse_detect_stream) GST_END_TEST; +/* Gapless tests are performed using a test signal that contains 30 MPEG + * frames, has padding samples at the beginning and at the end, a LAME + * tag to inform about said padding samples, and a sample rate of 32 kHz + * and 1 channel. The test signal is 1009ms long. setup_gapless_test_info() + * fills the GaplessTestInfo struct with details about this test signal. 
*/ + +typedef struct +{ + const gchar *filename; + guint num_mpeg_frames; + guint num_samples_per_frame; + guint num_start_padding_samples; + guint num_end_padding_samples; + guint sample_rate; + + guint first_padded_end_frame; + guint64 num_samples_with_padding; + guint64 num_samples_without_padding; + + GstClockTime first_frame_duration; + GstClockTime regular_frame_duration; + GstClockTime total_duration_without_padding; + + GstElement *appsink; + GstElement *parser; +} GaplessTestInfo; + +static void +setup_gapless_test_info (GaplessTestInfo * info) +{ + info->filename = "sine-1009ms-1ch-32000hz-gapless-with-lame-tag.mp3"; + info->num_mpeg_frames = 31; + info->num_samples_per_frame = 1152; /* standard for MP3s */ + info->sample_rate = 32000; + + /* Note that these start and end padding figures are not exactly like + * those that we get from the LAME tag. That's because that tag only + * contains the _encoder_ delay & padding. In the figures below, the + * _decoder_ delay is also factored in (529 samples). mpegaudioparse + * does the same, so we have to apply it here. */ + info->num_start_padding_samples = 1105; + info->num_end_padding_samples = 1167; + + /* In MP3s with LAME tags, the first frame is a frame made of Xing/LAME + * metadata and dummy nullsamples (this is for backwards compatibility). + * num_start_padding_samples defines how many padding samples are there + * (this does not include the nullsamples from the first dummy frame). + * Likewise, num_end_padding_samples defines how many padding samples + * are there at the end of the MP3 stream. + * There may be more padding samples than the size of one frame, meaning + * that there may be frames that are made entirely of padding samples. + * Such frames are output by mpegaudioparse, but their duration is set + * to 0, and their PTS corresponds to the last valid PTS in the stream + * (= the last PTS that is within the actual media data). 
+ * For this reason, we cannot just assume that the last frame is the + * one containing padding - there may be more. So, calculate the number + * of the first frame that contains padding samples from the _end_ of + * the stream. We'll need that later for buffer PTS and duration checks. */ + info->first_padded_end_frame = (info->num_mpeg_frames - 1 - + info->num_end_padding_samples / info->num_samples_per_frame); + info->num_samples_with_padding = (info->num_mpeg_frames - 1) * + info->num_samples_per_frame; + info->num_samples_without_padding = info->num_samples_with_padding - + info->num_start_padding_samples - info->num_end_padding_samples; + + /* The first frame (excluding the dummy frame at the beginning) will be + * clipped due to the padding samples at the start of the stream, so we + * have to calculate this separately. */ + info->first_frame_duration = + gst_util_uint64_scale_int (info->num_samples_per_frame - + info->num_start_padding_samples, GST_SECOND, info->sample_rate); + /* Regular, unclipped MPEG frame duration. */ + info->regular_frame_duration = + gst_util_uint64_scale_int (info->num_samples_per_frame, GST_SECOND, + info->sample_rate); + /* The total actual playtime duration. */ + info->total_duration_without_padding = + gst_util_uint64_scale_int (info->num_samples_without_padding, GST_SECOND, + info->sample_rate); +} + +static void +check_parsed_mpeg_frame (GaplessTestInfo * info, guint frame_num) +{ + GstClockTime expected_pts = GST_CLOCK_TIME_NONE; + GstClockTime expected_duration = GST_CLOCK_TIME_NONE; + gboolean expect_audioclipmeta = FALSE; + guint64 expected_audioclipmeta_start = 0; + guint64 expected_audioclipmeta_end = 0; + GstSample *sample; + GstBuffer *buffer; + GstAudioClippingMeta *audioclip_meta; + + GST_DEBUG ("checking frame %u", frame_num); + + /* This is called after the frame with the given number has been output by + * mpegaudioparse. 
We can then pull that frame from appsink, and check its + * PTS, duration, and audioclipmeta (if we expect it to be there). */ + + if (frame_num == 0) { + expected_pts = 0; + expected_duration = 0; + expect_audioclipmeta = FALSE; + } else if (frame_num == 1) { + /* First frame (excluding the dummy metadata frame at the beginning of + * the MPEG stream that mpegaudioparse internally drops). This one will be + * clipped due to the padding samples at the beginning, so we expect a + * clipping meta to be there. Also, its duration will be smaller than that + * of regular, unclipped frames. */ + + expected_pts = 0; + expected_duration = info->first_frame_duration; + + expect_audioclipmeta = TRUE; + expected_audioclipmeta_start = info->num_start_padding_samples; + expected_audioclipmeta_end = 0; + } else if (frame_num > 1 && frame_num < info->first_padded_end_frame) { + /* Regular, unclipped frame. */ + + expected_pts = info->first_frame_duration + (frame_num - 2) * + info->regular_frame_duration; + expected_duration = info->regular_frame_duration; + } else if (frame_num == info->first_padded_end_frame) { + /* The first frame at the end with padding samples. This one will have + * the last few valid samples, followed by the first padding samples. */ + + guint64 num_valid_samples = (info->num_samples_with_padding - + info->num_end_padding_samples) - (frame_num - 1) * + info->num_samples_per_frame; + guint64 num_padding_samples = info->num_samples_per_frame - + num_valid_samples; + + expected_pts = info->first_frame_duration + (frame_num - 2) * + info->regular_frame_duration; + expected_duration = gst_util_uint64_scale_int (num_valid_samples, + GST_SECOND, info->sample_rate); + + expect_audioclipmeta = TRUE; + expected_audioclipmeta_start = 0; + expected_audioclipmeta_end = num_padding_samples; + } else { + /* A fully clipped frame at the end of the stream. 
*/ + + expected_pts = info->total_duration_without_padding; + expected_duration = 0; + + expect_audioclipmeta = TRUE; + expected_audioclipmeta_start = 0; + expected_audioclipmeta_end = info->num_samples_per_frame; + } + + /* Pull the frame from appsink so we can check it. */ + + sample = gst_app_sink_pull_sample (GST_APP_SINK (info->appsink)); + fail_if (sample == NULL); + fail_unless (GST_IS_SAMPLE (sample)); + + buffer = gst_sample_get_buffer (sample); + fail_if (buffer == NULL); + + /* Verify the sample's PTS and duration. */ + fail_unless_equals_uint64 (GST_BUFFER_PTS (buffer), expected_pts); + fail_unless_equals_uint64 (GST_BUFFER_DURATION (buffer), expected_duration); + /* Check if there's audio clip metadata, and verify it if it exists. */ + if (expect_audioclipmeta) { + audioclip_meta = gst_buffer_get_audio_clipping_meta (buffer); + fail_if (audioclip_meta == NULL); + fail_unless_equals_uint64 (audioclip_meta->start, + expected_audioclipmeta_start); + fail_unless_equals_uint64 (audioclip_meta->end, expected_audioclipmeta_end); + } + + gst_sample_unref (sample); +} + +GST_START_TEST (test_parse_gapless_and_skip_padding_samples) +{ + GstElement *source, *parser, *appsink, *pipeline; + GstStateChangeReturn state_ret; + guint frame_num; + GaplessTestInfo info; + + setup_gapless_test_info (&info); + + pipeline = gst_pipeline_new (NULL); + source = gst_element_factory_make ("filesrc", NULL); + parser = gst_element_factory_make ("mpegaudioparse", NULL); + appsink = gst_element_factory_make ("appsink", NULL); + + info.appsink = appsink; + info.parser = parser; + + gst_bin_add_many (GST_BIN (pipeline), source, parser, appsink, NULL); + gst_element_link_many (source, parser, appsink, NULL); + + { + char *full_filename = + g_build_filename (GST_TEST_FILES_PATH, info.filename, NULL); + g_object_set (G_OBJECT (source), "location", full_filename, NULL); + g_free (full_filename); + } + + g_object_set (G_OBJECT (appsink), "async", FALSE, "sync", FALSE, + "max-buffers", 1, 
"enable-last-sample", FALSE, "processing-deadline", + G_MAXUINT64, NULL); + + state_ret = gst_element_set_state (pipeline, GST_STATE_PLAYING); + + fail_unless (state_ret != GST_STATE_CHANGE_FAILURE); + + if (state_ret == GST_STATE_CHANGE_ASYNC) { + GST_LOG ("waiting for pipeline to reach PAUSED state"); + state_ret = gst_element_get_state (pipeline, NULL, NULL, -1); + fail_unless_equals_int (state_ret, GST_STATE_CHANGE_SUCCESS); + } + + /* Verify all frames from the test signal. */ + for (frame_num = 0; frame_num < info.num_mpeg_frames; ++frame_num) + check_parsed_mpeg_frame (&info, frame_num); + + /* Check what duration is returned by a query. This duration must exclude + * the padding samples. */ + { + GstQuery *query; + gint64 duration; + GstFormat format; + + query = gst_query_new_duration (GST_FORMAT_TIME); + fail_unless (gst_element_query (pipeline, query)); + + gst_query_parse_duration (query, &format, &duration); + fail_unless_equals_int (format, GST_FORMAT_TIME); + fail_unless_equals_uint64 ((guint64) duration, + info.total_duration_without_padding); + + gst_query_unref (query); + } + + /* Seek tests: Here we seek to a certain position that corresponds to a + * certain frame. Then we check if we indeed got that frame. */ + + /* Seek back to the first frame. */ + { + fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PAUSED), + GST_STATE_CHANGE_SUCCESS); + gst_element_seek_simple (pipeline, GST_FORMAT_TIME, GST_SEEK_FLAG_FLUSH | + GST_SEEK_FLAG_KEY_UNIT, 0); + fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PLAYING), + GST_STATE_CHANGE_SUCCESS); + + check_parsed_mpeg_frame (&info, 1); + } + + /* Seek to the second frame. 
*/ + { + fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PAUSED), + GST_STATE_CHANGE_SUCCESS); + gst_element_seek_simple (pipeline, GST_FORMAT_TIME, GST_SEEK_FLAG_FLUSH | + GST_SEEK_FLAG_KEY_UNIT, info.first_frame_duration); + fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PLAYING), + GST_STATE_CHANGE_SUCCESS); + + check_parsed_mpeg_frame (&info, 2); + } + + /* Seek to the last frame with valid samples (= the first frame with padding + * samples at the end of the stream). */ + { + GstClockTime pts = info.first_frame_duration + + (info.first_padded_end_frame - 2) * info.regular_frame_duration; + + fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PAUSED), + GST_STATE_CHANGE_SUCCESS); + gst_element_seek_simple (pipeline, GST_FORMAT_TIME, GST_SEEK_FLAG_FLUSH | + GST_SEEK_FLAG_KEY_UNIT, pts); + fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PLAYING), + GST_STATE_CHANGE_SUCCESS); + + check_parsed_mpeg_frame (&info, info.first_padded_end_frame); + } + + gst_element_set_state (pipeline, GST_STATE_NULL); + gst_object_unref (pipeline); +} + +GST_END_TEST; + + static Suite * mpegaudioparse_suite (void) { @@ -142,6 +432,7 @@ mpegaudioparse_suite (void) tcase_add_test (tc_chain, test_parse_split); tcase_add_test (tc_chain, test_parse_skip_garbage); tcase_add_test (tc_chain, test_parse_detect_stream); + tcase_add_test (tc_chain, test_parse_gapless_and_skip_padding_samples); return s; } diff --git a/tests/files/sine-1009ms-1ch-32000hz-gapless-with-lame-tag.mp3 b/tests/files/sine-1009ms-1ch-32000hz-gapless-with-lame-tag.mp3 new file mode 100644 index 0000000000..b43c4f405e Binary files /dev/null and b/tests/files/sine-1009ms-1ch-32000hz-gapless-with-lame-tag.mp3 differ