gst-plugins-rs/mux/mp4/src/mp4mux/boxes.rs

// Copyright (C) 2022 Sebastian Dröge <sebastian@centricular.com>
//
// This Source Code Form is subject to the terms of the Mozilla Public License, v2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at
// <https://mozilla.org/MPL/2.0/>.
//
// SPDX-License-Identifier: MPL-2.0

use gst::prelude::*;

use anyhow::{anyhow, bail, Context, Error};

use std::str::FromStr;

fn write_box<T, F: FnOnce(&mut Vec<u8>) -> Result<T, Error>>(
    vec: &mut Vec<u8>,
    fourcc: impl std::borrow::Borrow<[u8; 4]>,
    content_func: F,
) -> Result<T, Error> {
    // Write zero size ...
    let size_pos = vec.len();
    vec.extend([0u8; 4]);
    vec.extend(fourcc.borrow());

    let res = content_func(vec)?;

    // ... and update it here later.
    let size: u32 = vec
        .len()
        .checked_sub(size_pos)
        .expect("vector shrunk")
        .try_into()
        .context("too big box content")?;
    vec[size_pos..][..4].copy_from_slice(&size.to_be_bytes());

    Ok(res)
}

const FULL_BOX_VERSION_0: u8 = 0;
const FULL_BOX_VERSION_1: u8 = 1;

const FULL_BOX_FLAGS_NONE: u32 = 0;

fn write_full_box<T, F: FnOnce(&mut Vec<u8>) -> Result<T, Error>>(
    vec: &mut Vec<u8>,
    fourcc: impl std::borrow::Borrow<[u8; 4]>,
    version: u8,
    flags: u32,
    content_func: F,
) -> Result<T, Error> {
    write_box(vec, fourcc, move |vec| {
        assert_eq!(flags >> 24, 0);
        vec.extend(((u32::from(version) << 24) | flags).to_be_bytes());
        content_func(vec)
    })
}

/// Creates `ftyp` box
pub(super) fn create_ftyp(variant: super::Variant) -> Result<gst::Buffer, Error> {
    let mut v = vec![];

    let (brand, compatible_brands) = match variant {
        super::Variant::ISO => (b"isom", vec![b"mp41", b"mp42"]),
    };

    write_box(&mut v, b"ftyp", |v| {
        // major brand
        v.extend(brand);
        // minor version
        v.extend(0u32.to_be_bytes());
        // compatible brands
        v.extend(compatible_brands.into_iter().flatten());

        Ok(())
    })?;

    Ok(gst::Buffer::from_mut_slice(v))
}

/// Creates `mdat` box *header*.
pub(super) fn create_mdat_header(size: Option<u64>) -> Result<gst::Buffer, Error> {
    let mut v = vec![];

    if let Some(size) = size {
        if let Ok(size) = u32::try_from(size + 8) {
            v.extend(8u32.to_be_bytes());
            v.extend(b"free");
            v.extend(size.to_be_bytes());
            v.extend(b"mdat");
        } else {
            v.extend(1u32.to_be_bytes());
            v.extend(b"mdat");
            v.extend((size + 16).to_be_bytes());
        }
    } else {
        v.extend(8u32.to_be_bytes());
        v.extend(b"free");
        v.extend(0u32.to_be_bytes());
        v.extend(b"mdat");
    }

    Ok(gst::Buffer::from_mut_slice(v))
}

/// Creates `moov` box
pub(super) fn create_moov(header: super::Header) -> Result<gst::Buffer, Error> {
    let mut v = vec![];

    write_box(&mut v, b"moov", |v| write_moov(v, &header))?;

    Ok(gst::Buffer::from_mut_slice(v))
}

fn write_moov(v: &mut Vec<u8>, header: &super::Header) -> Result<(), Error> {
    use gst::glib;

    let base = glib::DateTime::from_utc(1904, 1, 1, 0, 0, 0.0)?;
    let now = glib::DateTime::now_utc()?;
    let creation_time =
        u64::try_from(now.difference(&base).as_seconds()).expect("time before 1904");

    write_full_box(v, b"mvhd", FULL_BOX_VERSION_1, FULL_BOX_FLAGS_NONE, |v| {
        write_mvhd(v, header, creation_time)
    })?;
    for (idx, stream) in header.streams.iter().enumerate() {
        write_box(v, b"trak", |v| {
            write_trak(v, header, idx, stream, creation_time)
        })?;
    }

    Ok(())
}

fn stream_to_timescale(stream: &super::Stream) -> u32 {
    if stream.trak_timescale > 0 {
        stream.trak_timescale
    } else {
        let s = stream.caps.structure(0).unwrap();

        if let Ok(fps) = s.get::<gst::Fraction>("framerate") {
            if fps.numer() == 0 {
                return 10_000;
            }

            if fps.denom() != 1 && fps.denom() != 1001 {
                if let Some(fps) = (fps.denom() as u64)
                    .nseconds()
                    .mul_div_round(1_000_000_000, fps.numer() as u64)
                    .and_then(gst_video::guess_framerate)
                {
                    return (fps.numer() as u32)
                        .mul_div_round(100, fps.denom() as u32)
                        .unwrap_or(10_000);
                }
            }

            (fps.numer() as u32)
                .mul_div_round(100, fps.denom() as u32)
                .unwrap_or(10_000)
        } else if let Ok(rate) = s.get::<i32>("rate") {
            rate as u32
        } else {
            10_000
        }
    }
}

fn header_to_timescale(header: &super::Header) -> u32 {
    if header.movie_timescale > 0 {
        header.movie_timescale
    } else {
        // Use the reference track timescale
        stream_to_timescale(&header.streams[0])
    }
}

fn write_mvhd(v: &mut Vec<u8>, header: &super::Header, creation_time: u64) -> Result<(), Error> {
    let timescale = header_to_timescale(header);

    // Creation time
    v.extend(creation_time.to_be_bytes());
    // Modification time
    v.extend(creation_time.to_be_bytes());
    // Timescale
    v.extend(timescale.to_be_bytes());
    // Duration
    let min_earliest_pts = header.streams.iter().map(|s| s.earliest_pts).min().unwrap();
    let max_end_pts = header
        .streams
        .iter()
        .map(|stream| stream.end_pts)
        .max()
        .unwrap();
    let duration = (max_end_pts - min_earliest_pts)
        .nseconds()
        .mul_div_round(timescale as u64, gst::ClockTime::SECOND.nseconds())
        .context("too big track duration")?;
    v.extend(duration.to_be_bytes());

    // Rate 1.0
    v.extend((1u32 << 16).to_be_bytes());
    // Volume 1.0
    v.extend((1u16 << 8).to_be_bytes());
    // Reserved
    v.extend([0u8; 2 + 2 * 4]);

    // Matrix
    v.extend(
        [
            (1u32 << 16).to_be_bytes(),
            0u32.to_be_bytes(),
            0u32.to_be_bytes(),
            0u32.to_be_bytes(),
            (1u32 << 16).to_be_bytes(),
            0u32.to_be_bytes(),
            0u32.to_be_bytes(),
            0u32.to_be_bytes(),
            (16384u32 << 16).to_be_bytes(),
        ]
        .into_iter()
        .flatten(),
    );

    // Pre defined
    v.extend([0u8; 6 * 4]);

    // Next track id
    v.extend((header.streams.len() as u32 + 1).to_be_bytes());

    Ok(())
}

const TKHD_FLAGS_TRACK_ENABLED: u32 = 0x1;
const TKHD_FLAGS_TRACK_IN_MOVIE: u32 = 0x2;
const TKHD_FLAGS_TRACK_IN_PREVIEW: u32 = 0x4;

fn write_trak(
    v: &mut Vec<u8>,
    header: &super::Header,
    idx: usize,
    stream: &super::Stream,
    creation_time: u64,
) -> Result<(), Error> {
    write_full_box(
        v,
        b"tkhd",
        FULL_BOX_VERSION_1,
        TKHD_FLAGS_TRACK_ENABLED | TKHD_FLAGS_TRACK_IN_MOVIE | TKHD_FLAGS_TRACK_IN_PREVIEW,
        |v| write_tkhd(v, header, idx, stream, creation_time),
    )?;

    write_box(v, b"mdia", |v| write_mdia(v, header, stream, creation_time))?;
    write_box(v, b"edts", |v| write_edts(v, header, stream))?;

    Ok(())
}

fn write_tkhd(
    v: &mut Vec<u8>,
    header: &super::Header,
    idx: usize,
    stream: &super::Stream,
    creation_time: u64,
) -> Result<(), Error> {
    // Creation time
    v.extend(creation_time.to_be_bytes());
    // Modification time
    v.extend(creation_time.to_be_bytes());
    // Track ID
    v.extend((idx as u32 + 1).to_be_bytes());
    // Reserved
    v.extend(0u32.to_be_bytes());
    // Duration

    // Track header duration is in movie header timescale
    let timescale = header_to_timescale(header);

    let min_earliest_pts = header.streams.iter().map(|s| s.earliest_pts).min().unwrap();
    // Duration is the end PTS of this stream up to the beginning of the earliest stream
    let duration = stream.end_pts - min_earliest_pts;
    let duration = duration
        .nseconds()
        .mul_div_round(timescale as u64, gst::ClockTime::SECOND.nseconds())
        .context("too big track duration")?;
    v.extend(duration.to_be_bytes());

    // Reserved
    v.extend([0u8; 2 * 4]);

    // Layer
    v.extend(0u16.to_be_bytes());
    // Alternate group
    v.extend(0u16.to_be_bytes());

    // Volume
    let s = stream.caps.structure(0).unwrap();
    match s.name() {
        "audio/mpeg" | "audio/x-opus" | "audio/x-alaw" | "audio/x-mulaw" | "audio/x-adpcm" => {
            v.extend((1u16 << 8).to_be_bytes())
        }
        _ => v.extend(0u16.to_be_bytes()),
    }

    // Reserved
    v.extend([0u8; 2]);

    // Matrix
    v.extend(
        [
            (1u32 << 16).to_be_bytes(),
            0u32.to_be_bytes(),
            0u32.to_be_bytes(),
            0u32.to_be_bytes(),
            (1u32 << 16).to_be_bytes(),
            0u32.to_be_bytes(),
            0u32.to_be_bytes(),
            0u32.to_be_bytes(),
            (16384u32 << 16).to_be_bytes(),
        ]
        .into_iter()
        .flatten(),
    );

    // Width/height
    match s.name() {
        "video/x-h264" | "video/x-h265" | "video/x-vp9" | "image/jpeg" => {
            let width = s.get::<i32>("width").context("video caps without width")? as u32;
            let height = s
                .get::<i32>("height")
                .context("video caps without height")? as u32;
            let par = s
                .get::<gst::Fraction>("pixel-aspect-ratio")
                .unwrap_or_else(|_| gst::Fraction::new(1, 1));

            let width = std::cmp::min(
                width
                    .mul_div_round(par.numer() as u32, par.denom() as u32)
                    .unwrap_or(u16::MAX as u32),
                u16::MAX as u32,
            );
            let height = std::cmp::min(height, u16::MAX as u32);

            v.extend((width << 16).to_be_bytes());
            v.extend((height << 16).to_be_bytes());
        }
        _ => v.extend([0u8; 2 * 4]),
    }

    Ok(())
}

fn write_mdia(
    v: &mut Vec<u8>,
    header: &super::Header,
    stream: &super::Stream,
    creation_time: u64,
) -> Result<(), Error> {
    write_full_box(v, b"mdhd", FULL_BOX_VERSION_1, FULL_BOX_FLAGS_NONE, |v| {
        write_mdhd(v, header, stream, creation_time)
    })?;
    write_full_box(v, b"hdlr", FULL_BOX_VERSION_0, FULL_BOX_FLAGS_NONE, |v| {
        write_hdlr(v, header, stream)
    })?;

    // TODO: write elng if needed

    write_box(v, b"minf", |v| write_minf(v, header, stream))?;

    Ok(())
}

fn language_code(lang: impl std::borrow::Borrow<[u8; 3]>) -> u16 {
    let lang = lang.borrow();

    // TODO: Need to relax this once we get the language code from tags
    assert!(lang.iter().all(u8::is_ascii_lowercase));

    (((lang[0] as u16 - 0x60) & 0x1F) << 10)
        + (((lang[1] as u16 - 0x60) & 0x1F) << 5)
        + ((lang[2] as u16 - 0x60) & 0x1F)
}

fn write_mdhd(
    v: &mut Vec<u8>,
    _header: &super::Header,
    stream: &super::Stream,
    creation_time: u64,
) -> Result<(), Error> {
    let timescale = stream_to_timescale(stream);

    // Creation time
    v.extend(creation_time.to_be_bytes());
    // Modification time
    v.extend(creation_time.to_be_bytes());
    // Timescale
    v.extend(timescale.to_be_bytes());
    // Duration
    let duration = stream
        .chunks
        .iter()
        .flat_map(|c| c.samples.iter().map(|b| b.duration.nseconds()))
        .sum::<u64>()
        .mul_div_round(timescale as u64, gst::ClockTime::SECOND.nseconds())
        .context("too big track duration")?;
    v.extend(duration.to_be_bytes());

    // Language as ISO-639-2/T
    // TODO: get actual language from the tags
    v.extend(language_code(b"und").to_be_bytes());

    // Pre-defined
    v.extend([0u8; 2]);

    Ok(())
}

fn write_hdlr(
    v: &mut Vec<u8>,
    _header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    // Pre-defined
    v.extend([0u8; 4]);

    let s = stream.caps.structure(0).unwrap();
    let (handler_type, name) = match s.name() {
        "video/x-h264" | "video/x-h265" | "video/x-vp9" | "image/jpeg" => {
            (b"vide", b"VideoHandler\0".as_slice())
        }
        "audio/mpeg" | "audio/x-opus" | "audio/x-alaw" | "audio/x-mulaw" | "audio/x-adpcm" => {
            (b"soun", b"SoundHandler\0".as_slice())
        }
        _ => unreachable!(),
    };

    // Handler type
    v.extend(handler_type);

    // Reserved
    v.extend([0u8; 3 * 4]);

    // Name
    v.extend(name);

    Ok(())
}

fn write_minf(
    v: &mut Vec<u8>,
    header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    let s = stream.caps.structure(0).unwrap();

    match s.name() {
        "video/x-h264" | "video/x-h265" | "video/x-vp9" | "image/jpeg" => {
            // Flags are always 1 for unspecified reasons
            write_full_box(v, b"vmhd", FULL_BOX_VERSION_0, 1, |v| write_vmhd(v, header))?
        }
        "audio/mpeg" | "audio/x-opus" | "audio/x-alaw" | "audio/x-mulaw" | "audio/x-adpcm" => {
            write_full_box(v, b"smhd", FULL_BOX_VERSION_0, FULL_BOX_FLAGS_NONE, |v| {
                write_smhd(v, header)
            })?
        }
        _ => unreachable!(),
    }

    write_box(v, b"dinf", |v| write_dinf(v, header))?;

    write_box(v, b"stbl", |v| write_stbl(v, header, stream))?;

    Ok(())
}

fn write_vmhd(v: &mut Vec<u8>, _header: &super::Header) -> Result<(), Error> {
    // Graphics mode
    v.extend([0u8; 2]);

    // opcolor
    v.extend([0u8; 2 * 3]);

    Ok(())
}

fn write_smhd(v: &mut Vec<u8>, _header: &super::Header) -> Result<(), Error> {
    // Balance
    v.extend([0u8; 2]);

    // Reserved
    v.extend([0u8; 2]);

    Ok(())
}

fn write_dinf(v: &mut Vec<u8>, header: &super::Header) -> Result<(), Error> {
    write_full_box(v, b"dref", FULL_BOX_VERSION_0, FULL_BOX_FLAGS_NONE, |v| {
        write_dref(v, header)
    })?;

    Ok(())
}

const DREF_FLAGS_MEDIA_IN_SAME_FILE: u32 = 0x1;

fn write_dref(v: &mut Vec<u8>, _header: &super::Header) -> Result<(), Error> {
    // Entry count
    v.extend(1u32.to_be_bytes());

    write_full_box(
        v,
        b"url ",
        FULL_BOX_VERSION_0,
        DREF_FLAGS_MEDIA_IN_SAME_FILE,
        |_v| Ok(()),
    )?;

    Ok(())
}

fn write_stbl(
    v: &mut Vec<u8>,
    header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    write_full_box(v, b"stsd", FULL_BOX_VERSION_0, FULL_BOX_FLAGS_NONE, |v| {
        write_stsd(v, header, stream)
    })?;
    write_full_box(v, b"stts", FULL_BOX_VERSION_0, FULL_BOX_FLAGS_NONE, |v| {
        write_stts(v, header, stream)
    })?;

    // If there are any composition time offsets we need to write the ctts box. If any are negative
    // we need to write version 1 of the box, otherwise version 0 is sufficient.
    let mut need_ctts = None;
    if stream.delta_frames.requires_dts() {
        for composition_time_offset in stream.chunks.iter().flat_map(|c| {
            c.samples.iter().map(|b| {
                b.composition_time_offset
                    .expect("not all samples have a composition time offset")
            })
        }) {
            if composition_time_offset < 0 {
                need_ctts = Some(1);
                break;
            } else {
                need_ctts = Some(0);
            }
        }
    }
    if let Some(need_ctts) = need_ctts {
        let version = if need_ctts == 0 {
            FULL_BOX_VERSION_0
        } else {
            FULL_BOX_VERSION_1
        };

        write_full_box(v, b"ctts", version, FULL_BOX_FLAGS_NONE, |v| {
            write_ctts(v, header, stream, version)
        })?;

        write_full_box(v, b"cslg", FULL_BOX_VERSION_1, FULL_BOX_FLAGS_NONE, |v| {
            write_cslg(v, header, stream)
        })?;
    }

    // If any sample is not a sync point, write the stss box
    if !stream.delta_frames.intra_only()
        && stream
            .chunks
            .iter()
            .flat_map(|c| c.samples.iter().map(|b| b.sync_point))
            .any(|sync_point| !sync_point)
    {
        write_full_box(v, b"stss", FULL_BOX_VERSION_0, FULL_BOX_FLAGS_NONE, |v| {
            write_stss(v, header, stream)
        })?;
    }

    write_full_box(v, b"stsz", FULL_BOX_VERSION_0, FULL_BOX_FLAGS_NONE, |v| {
        write_stsz(v, header, stream)
    })?;

    write_full_box(v, b"stsc", FULL_BOX_VERSION_0, FULL_BOX_FLAGS_NONE, |v| {
        write_stsc(v, header, stream)
    })?;

    if stream.chunks.last().unwrap().offset > u32::MAX as u64 {
        write_full_box(v, b"co64", FULL_BOX_VERSION_0, FULL_BOX_FLAGS_NONE, |v| {
            write_stco(v, header, stream, true)
        })?;
    } else {
        write_full_box(v, b"stco", FULL_BOX_VERSION_0, FULL_BOX_FLAGS_NONE, |v| {
            write_stco(v, header, stream, false)
        })?;
    }

    Ok(())
}

fn write_stsd(
    v: &mut Vec<u8>,
    header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    // Entry count
    v.extend(1u32.to_be_bytes());

    let s = stream.caps.structure(0).unwrap();
    match s.name() {
        "video/x-h264" | "video/x-h265" | "video/x-vp9" | "image/jpeg" => {
            write_visual_sample_entry(v, header, stream)?
        }
        "audio/mpeg" | "audio/x-opus" | "audio/x-alaw" | "audio/x-mulaw" | "audio/x-adpcm" => {
            write_audio_sample_entry(v, header, stream)?
        }
        _ => unreachable!(),
    }

    Ok(())
}

fn write_sample_entry_box<T, F: FnOnce(&mut Vec<u8>) -> Result<T, Error>>(
    v: &mut Vec<u8>,
    fourcc: impl std::borrow::Borrow<[u8; 4]>,
    content_func: F,
) -> Result<T, Error> {
    write_box(v, fourcc, move |v| {
        // Reserved
        v.extend([0u8; 6]);

        // Data reference index
        v.extend(1u16.to_be_bytes());

        content_func(v)
    })
}

fn write_visual_sample_entry(
    v: &mut Vec<u8>,
    _header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    let s = stream.caps.structure(0).unwrap();
    let fourcc = match s.name() {
        "video/x-h264" => {
            let stream_format = s.get::<&str>("stream-format").context("no stream-format")?;
            match stream_format {
                "avc" => b"avc1",
                "avc3" => b"avc3",
                _ => unreachable!(),
            }
        }
        "video/x-h265" => {
            let stream_format = s.get::<&str>("stream-format").context("no stream-format")?;
            match stream_format {
                "hvc1" => b"hvc1",
                "hev1" => b"hev1",
                _ => unreachable!(),
            }
        }
        "image/jpeg" => b"jpeg",
        "video/x-vp9" => b"vp09",
        _ => unreachable!(),
    };

    write_sample_entry_box(v, fourcc, move |v| {
        // pre-defined
        v.extend([0u8; 2]);
        // Reserved
        v.extend([0u8; 2]);
        // pre-defined
        v.extend([0u8; 3 * 4]);

        // Width
        let width =
            u16::try_from(s.get::<i32>("width").context("no width")?).context("too big width")?;
        v.extend(width.to_be_bytes());

        // Height
        let height = u16::try_from(s.get::<i32>("height").context("no height")?)
            .context("too big height")?;
        v.extend(height.to_be_bytes());

        // Horizontal resolution
        v.extend(0x00480000u32.to_be_bytes());

        // Vertical resolution
        v.extend(0x00480000u32.to_be_bytes());

        // Reserved
        v.extend([0u8; 4]);

        // Frame count
        v.extend(1u16.to_be_bytes());

        // Compressor name
        v.extend([0u8; 32]);

        // Depth
        v.extend(0x0018u16.to_be_bytes());

        // Pre-defined
        v.extend((-1i16).to_be_bytes());

        // Codec specific boxes
        match s.name() {
            "video/x-h264" => {
                let codec_data = s
                    .get::<&gst::BufferRef>("codec_data")
                    .context("no codec_data")?;
                let map = codec_data
                    .map_readable()
                    .context("codec_data not mappable")?;
                write_box(v, b"avcC", move |v| {
                    v.extend_from_slice(&map);
                    Ok(())
                })?;
            }
            "video/x-h265" => {
                let codec_data = s
                    .get::<&gst::BufferRef>("codec_data")
                    .context("no codec_data")?;
                let map = codec_data
                    .map_readable()
                    .context("codec_data not mappable")?;
                write_box(v, b"hvcC", move |v| {
                    v.extend_from_slice(&map);
                    Ok(())
                })?;
            }
            "video/x-vp9" => {
                let profile: u8 = match s.get::<&str>("profile").expect("no vp9 profile") {
                    "0" => Some(0),
                    "1" => Some(1),
                    "2" => Some(2),
                    "3" => Some(3),
                    _ => None,
                }
                .context("unsupported vp9 profile")?;
                let colorimetry = gst_video::VideoColorimetry::from_str(
                    s.get::<&str>("colorimetry").expect("no colorimetry"),
                )
                .context("failed to parse colorimetry")?;
                let video_full_range =
                    colorimetry.range() == gst_video::VideoColorRange::Range0_255;
                let chroma_format: u8 =
                    match s.get::<&str>("chroma-format").expect("no chroma-format") {
                        "4:2:0" =>
                        // chroma-site is optional
                        {
                            match s
                                .get::<&str>("chroma-site")
                                .ok()
                                .and_then(|cs| gst_video::VideoChromaSite::from_str(cs).ok())
                            {
                                Some(gst_video::VideoChromaSite::V_COSITED) => Some(0),
                                // COSITED
                                _ => Some(1),
                            }
                        }
                        "4:2:2" => Some(2),
                        "4:4:4" => Some(3),
                        _ => None,
                    }
                    .context("unsupported chroma-format")?;
                let bit_depth: u8 = {
                    let bit_depth_luma = s.get::<u32>("bit-depth-luma").expect("no bit-depth-luma");
                    let bit_depth_chroma = s
                        .get::<u32>("bit-depth-chroma")
                        .expect("no bit-depth-chroma");
                    if bit_depth_luma != bit_depth_chroma {
                        return Err(anyhow!("bit-depth-luma and bit-depth-chroma have different values which is an unsupported configuration"));
                    }
                    bit_depth_luma as u8
                };
                write_full_box(v, b"vpcC", 1, 0, move |v| {
                    v.push(profile);
                    // XXX: hardcoded level 1
                    v.push(10);
                    let mut byte: u8 = 0;
                    byte |= (bit_depth & 0xF) << 4;
                    byte |= (chroma_format & 0x7) << 1;
                    byte |= video_full_range as u8;
                    v.push(byte);
                    v.push(colorimetry.primaries().to_iso() as u8);
                    v.push(colorimetry.transfer().to_iso() as u8);
                    v.push(colorimetry.matrix().to_iso() as u8);
                    // 16-bit length field for codec initialization, unused
                    v.push(0);
                    v.push(0);
                    Ok(())
                })?;
            }
            "image/jpeg" => {
                // Nothing to do here
            }
            _ => unreachable!(),
        }

        if let Ok(par) = s.get::<gst::Fraction>("pixel-aspect-ratio") {
            write_box(v, b"pasp", move |v| {
                v.extend((par.numer() as u32).to_be_bytes());
                v.extend((par.denom() as u32).to_be_bytes());
                Ok(())
            })?;
        }

        if let Some(colorimetry) = s
            .get::<&str>("colorimetry")
            .ok()
            .and_then(|c| c.parse::<gst_video::VideoColorimetry>().ok())
        {
            write_box(v, b"colr", move |v| {
                v.extend(b"nclx");
                let (primaries, transfer, matrix) = {
                    #[cfg(feature = "v1_18")]
                    {
                        (
                            (colorimetry.primaries().to_iso() as u16),
                            (colorimetry.transfer().to_iso() as u16),
                            (colorimetry.matrix().to_iso() as u16),
                        )
                    }
                    #[cfg(not(feature = "v1_18"))]
                    {
                        let primaries = match colorimetry.primaries() {
                            gst_video::VideoColorPrimaries::Bt709 => 1u16,
                            gst_video::VideoColorPrimaries::Bt470m => 4u16,
                            gst_video::VideoColorPrimaries::Bt470bg => 5u16,
                            gst_video::VideoColorPrimaries::Smpte170m => 6u16,
                            gst_video::VideoColorPrimaries::Smpte240m => 7u16,
                            gst_video::VideoColorPrimaries::Film => 8u16,
                            gst_video::VideoColorPrimaries::Bt2020 => 9u16,
                            _ => 2,
                        };
                        let transfer = match colorimetry.transfer() {
                            gst_video::VideoTransferFunction::Bt709 => 1u16,
                            gst_video::VideoTransferFunction::Gamma22 => 4u16,
                            gst_video::VideoTransferFunction::Gamma28 => 5u16,
                            gst_video::VideoTransferFunction::Smpte240m => 7u16,
                            gst_video::VideoTransferFunction::Gamma10 => 8u16,
                            gst_video::VideoTransferFunction::Log100 => 9u16,
                            gst_video::VideoTransferFunction::Log316 => 10u16,
                            gst_video::VideoTransferFunction::Srgb => 13u16,
                            gst_video::VideoTransferFunction::Bt202012 => 15u16,
                            _ => 2,
                        };
                        let matrix = match colorimetry.matrix() {
                            gst_video::VideoColorMatrix::Rgb => 0u16,
                            gst_video::VideoColorMatrix::Bt709 => 1u16,
                            gst_video::VideoColorMatrix::Fcc => 4u16,
                            gst_video::VideoColorMatrix::Bt601 => 6u16,
                            gst_video::VideoColorMatrix::Smpte240m => 7u16,
                            gst_video::VideoColorMatrix::Bt2020 => 9u16,
                            _ => 2,
                        };

                        (primaries, transfer, matrix)
                    }
                };

                let full_range = match colorimetry.range() {
                    gst_video::VideoColorRange::Range0_255 => 0x80u8,
                    gst_video::VideoColorRange::Range16_235 => 0x00u8,
                    _ => 0x00,
                };

                v.extend(primaries.to_be_bytes());
                v.extend(transfer.to_be_bytes());
                v.extend(matrix.to_be_bytes());
                v.push(full_range);

                Ok(())
            })?;
        }

        #[cfg(feature = "v1_18")]
        {
            if let Ok(cll) = gst_video::VideoContentLightLevel::from_caps(&stream.caps) {
                write_box(v, b"clli", move |v| {
                    v.extend((cll.max_content_light_level() as u16).to_be_bytes());
                    v.extend((cll.max_frame_average_light_level() as u16).to_be_bytes());
                    Ok(())
                })?;
            }

            if let Ok(mastering) = gst_video::VideoMasteringDisplayInfo::from_caps(&stream.caps) {
                write_box(v, b"mdcv", move |v| {
                    for primary in mastering.display_primaries() {
                        v.extend(primary.x.to_be_bytes());
                        v.extend(primary.y.to_be_bytes());
                    }
                    v.extend(mastering.white_point().x.to_be_bytes());
                    v.extend(mastering.white_point().y.to_be_bytes());
                    v.extend(mastering.max_display_mastering_luminance().to_be_bytes());
                    v.extend(mastering.max_display_mastering_luminance().to_be_bytes());
                    Ok(())
                })?;
            }
        }

        // Write fiel box for codecs that require it
        if ["image/jpeg"].contains(&s.name()) {
            let interlace_mode = s
                .get::<&str>("interlace-mode")
                .ok()
                .map(gst_video::VideoInterlaceMode::from_string)
                .unwrap_or(gst_video::VideoInterlaceMode::Progressive);
            let field_order = s
                .get::<&str>("field-order")
                .ok()
                .map(gst_video::VideoFieldOrder::from_string)
                .unwrap_or(gst_video::VideoFieldOrder::Unknown);

            write_box(v, b"fiel", move |v| {
                let (interlace, field_order) = match interlace_mode {
                    gst_video::VideoInterlaceMode::Progressive => (1, 0),
                    gst_video::VideoInterlaceMode::Interleaved
                        if field_order == gst_video::VideoFieldOrder::TopFieldFirst =>
                    {
                        (2, 9)
                    }
                    gst_video::VideoInterlaceMode::Interleaved => (2, 14),
                    _ => (0, 0),
                };

                v.push(interlace);
                v.push(field_order);
                Ok(())
            })?;
        }

        // TODO: write btrt bitrate box based on tags

        Ok(())
    })?;

    Ok(())
}

fn write_audio_sample_entry(
    v: &mut Vec<u8>,
    _header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    let s = stream.caps.structure(0).unwrap();
    let fourcc = match s.name() {
        "audio/mpeg" => b"mp4a",
        "audio/x-opus" => b"Opus",
        "audio/x-alaw" => b"alaw",
        "audio/x-mulaw" => b"ulaw",
        "audio/x-adpcm" => {
            let layout = s.get::<&str>("layout").context("no ADPCM layout field")?;

            match layout {
                "g726" => b"ms\x00\x45",
                _ => unreachable!(),
            }
        }
        _ => unreachable!(),
    };

    let sample_size = match s.name() {
        "audio/x-adpcm" => {
            let bitrate = s.get::<i32>("bitrate").context("no ADPCM bitrate field")?;
            (bitrate / 8000) as u16
        }
        _ => 16u16,
    };

    write_sample_entry_box(v, fourcc, move |v| {
        // Reserved
        v.extend([0u8; 2 * 4]);

        // Channel count
        let channels = u16::try_from(s.get::<i32>("channels").context("no channels")?)
            .context("too many channels")?;
        v.extend(channels.to_be_bytes());

        // Sample size
        v.extend(sample_size.to_be_bytes());

        // Pre-defined
        v.extend([0u8; 2]);

        // Reserved
        v.extend([0u8; 2]);

        // Sample rate
        let rate = u16::try_from(s.get::<i32>("rate").context("no rate")?).unwrap_or(0);
        v.extend((u32::from(rate) << 16).to_be_bytes());

        // Codec specific boxes
        match s.name() {
            "audio/mpeg" => {
                let codec_data = s
                    .get::<&gst::BufferRef>("codec_data")
                    .context("no codec_data")?;
                let map = codec_data
                    .map_readable()
                    .context("codec_data not mappable")?;
                if map.len() < 2 {
                    bail!("too small codec_data");
                }
                write_esds_aac(v, &map)?;
            }
            "audio/x-opus" => {
                write_dops(v, &stream.caps)?;
            }
            "audio/x-alaw" | "audio/x-mulaw" | "audio/x-adpcm" => {
                // Nothing to do here
            }
            _ => unreachable!(),
        }

        // If rate did not fit into 16 bits write a full `srat` box
        if rate == 0 {
            let rate = s.get::<i32>("rate").context("no rate")?;
            // FIXME: This is defined as full box?
            write_full_box(
                v,
                b"srat",
                FULL_BOX_VERSION_0,
                FULL_BOX_FLAGS_NONE,
                move |v| {
                    v.extend((rate as u32).to_be_bytes());
                    Ok(())
                },
            )?;
        }

        // TODO: write btrt bitrate box based on tags

        // TODO: chnl box for channel ordering? probably not needed for AAC

        Ok(())
    })?;

    Ok(())
}

fn write_esds_aac(v: &mut Vec<u8>, codec_data: &[u8]) -> Result<(), Error> {
    let calculate_len = |mut len| {
        if len > 260144641 {
            bail!("too big descriptor length");
        }

        if len == 0 {
            return Ok(([0; 4], 1));
        }

        let mut idx = 0;
        let mut lens = [0u8; 4];
        while len > 0 {
            lens[idx] = ((if len > 0x7f { 0x80 } else { 0x00 }) | (len & 0x7f)) as u8;
            idx += 1;
            len >>= 7;
        }

        Ok((lens, idx))
    };

    write_full_box(
        v,
        b"esds",
        FULL_BOX_VERSION_0,
        FULL_BOX_FLAGS_NONE,
        move |v| {
            // Calculate all lengths bottom up

            // Decoder specific info
            let decoder_specific_info_len = calculate_len(codec_data.len())?;

            // Decoder config
            let decoder_config_len =
                calculate_len(13 + 1 + decoder_specific_info_len.1 + codec_data.len())?;

            // SL config
            let sl_config_len = calculate_len(1)?;

            // ES descriptor
            let es_descriptor_len = calculate_len(
                3 + 1
                    + decoder_config_len.1
                    + 13
                    + 1
                    + decoder_specific_info_len.1
                    + codec_data.len()
                    + 1
                    + sl_config_len.1
                    + 1,
            )?;

            // ES descriptor tag
            v.push(0x03);

            // Length
            v.extend_from_slice(&es_descriptor_len.0[..(es_descriptor_len.1)]);

            // Track ID
            v.extend(1u16.to_be_bytes());
            // Flags
            v.push(0u8);

            // Decoder config descriptor
            v.push(0x04);

            // Length
            v.extend_from_slice(&decoder_config_len.0[..(decoder_config_len.1)]);

            // Object type ESDS_OBJECT_TYPE_MPEG4_P3
            v.push(0x40);
            // Stream type ESDS_STREAM_TYPE_AUDIO
            v.push((0x05 << 2) | 0x01);

            // Buffer size db?
            v.extend([0u8; 3]);

            // Max bitrate
            v.extend(0u32.to_be_bytes());

            // Avg bitrate
            v.extend(0u32.to_be_bytes());

            // Decoder specific info
            v.push(0x05);

            // Length
            v.extend_from_slice(&decoder_specific_info_len.0[..(decoder_specific_info_len.1)]);
            v.extend_from_slice(codec_data);

            // SL config descriptor
            v.push(0x06);

            // Length: 1 (tag) + 1 (length) + 1 (predefined)
            v.extend_from_slice(&sl_config_len.0[..(sl_config_len.1)]);

            // Predefined
            v.push(0x02);
            Ok(())
        },
    )
}

fn write_dops(v: &mut Vec<u8>, caps: &gst::Caps) -> Result<(), Error> {
    let rate;
    let channels;
    let channel_mapping_family;
    let stream_count;
    let coupled_count;
    let pre_skip;
    let output_gain;
    let mut channel_mapping = [0; 256];

    // TODO: Use audio clipping meta to calculate pre_skip

    if let Some(header) = caps
        .structure(0)
        .unwrap()
        .get::<gst::ArrayRef>("streamheader")
        .ok()
        .and_then(|a| a.get(0).and_then(|v| v.get::<gst::Buffer>().ok()))
    {
        (
            rate,
            channels,
            channel_mapping_family,
            stream_count,
            coupled_count,
            pre_skip,
            output_gain,
        ) = gst_pbutils::codec_utils_opus_parse_header(&header, Some(&mut channel_mapping))
            .unwrap();
    } else {
        (
            rate,
            channels,
            channel_mapping_family,
            stream_count,
            coupled_count,
        ) = gst_pbutils::codec_utils_opus_parse_caps(caps, Some(&mut channel_mapping)).unwrap();
        output_gain = 0;
        pre_skip = 0;
    }

    write_box(v, b"dOps", move |v| {
        // Version number
        v.push(0);
        v.push(channels);
        v.extend(pre_skip.to_le_bytes());
        v.extend(rate.to_le_bytes());
        v.extend(output_gain.to_le_bytes());
        v.push(channel_mapping_family);
        if channel_mapping_family > 0 {
            v.push(stream_count);
            v.push(coupled_count);
            v.extend(&channel_mapping[..channels as usize]);
        }

        Ok(())
    })
}

fn write_stts(
    v: &mut Vec<u8>,
    _header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    let timescale = stream_to_timescale(stream);

    let entry_count_position = v.len();
    // Entry count, rewritten in the end
    v.extend(0u32.to_be_bytes());

    let mut last_duration: Option<u32> = None;
    let mut sample_count = 0u32;
    let mut num_entries = 0u32;
    for duration in stream
        .chunks
        .iter()
        .flat_map(|c| c.samples.iter().map(|b| b.duration))
    {
        let duration = u32::try_from(
            duration
                .nseconds()
                .mul_div_round(timescale as u64, gst::ClockTime::SECOND.nseconds())
                .context("too big sample duration")?,
        )
        .context("too big sample duration")?;

        if last_duration.map_or(true, |last_duration| last_duration != duration) {
            if let Some(last_duration) = last_duration {
                v.extend(sample_count.to_be_bytes());
                v.extend(last_duration.to_be_bytes());
                num_entries += 1;
            }

            last_duration = Some(duration);
            sample_count = 1;
        } else {
            sample_count += 1;
        }
    }

    if let Some(last_duration) = last_duration {
        v.extend(sample_count.to_be_bytes());
        v.extend(last_duration.to_be_bytes());
        num_entries += 1;
    }

    // Rewrite entry count
    v[entry_count_position..][..4].copy_from_slice(&num_entries.to_be_bytes());

    Ok(())
}

fn write_ctts(
    v: &mut Vec<u8>,
    _header: &super::Header,
    stream: &super::Stream,
    version: u8,
) -> Result<(), Error> {
    let timescale = stream_to_timescale(stream);

    let entry_count_position = v.len();
    // Entry count, rewritten in the end
    v.extend(0u32.to_be_bytes());

    let mut last_composition_time_offset = None;
    let mut sample_count = 0u32;
    let mut num_entries = 0u32;
    for composition_time_offset in stream
        .chunks
        .iter()
        .flat_map(|c| c.samples.iter().map(|b| b.composition_time_offset))
    {
        let composition_time_offset = composition_time_offset
            .expect("not all samples have a composition time offset")
            .mul_div_round(timescale as i64, gst::ClockTime::SECOND.nseconds() as i64)
            .context("too big sample composition time offset")?;

        if last_composition_time_offset.map_or(true, |last_composition_time_offset| {
            last_composition_time_offset != composition_time_offset
        }) {
            if let Some(last_composition_time_offset) = last_composition_time_offset {
                v.extend(sample_count.to_be_bytes());
                if version == FULL_BOX_VERSION_0 {
                    let last_composition_time_offset = u32::try_from(last_composition_time_offset)
                        .context("too big sample composition time offset")?;

                    v.extend(last_composition_time_offset.to_be_bytes());
                } else {
                    let last_composition_time_offset = i32::try_from(last_composition_time_offset)
                        .context("too big sample composition time offset")?;
                    v.extend(last_composition_time_offset.to_be_bytes());
                }
                num_entries += 1;
            }

            last_composition_time_offset = Some(composition_time_offset);
            sample_count = 1;
        } else {
            sample_count += 1;
        }
    }

    if let Some(last_composition_time_offset) = last_composition_time_offset {
        v.extend(sample_count.to_be_bytes());
        if version == FULL_BOX_VERSION_0 {
            let last_composition_time_offset = u32::try_from(last_composition_time_offset)
                .context("too big sample composition time offset")?;

            v.extend(last_composition_time_offset.to_be_bytes());
        } else {
            let last_composition_time_offset = i32::try_from(last_composition_time_offset)
                .context("too big sample composition time offset")?;
            v.extend(last_composition_time_offset.to_be_bytes());
        }
        num_entries += 1;
    }

    // Rewrite entry count
    v[entry_count_position..][..4].copy_from_slice(&num_entries.to_be_bytes());

    Ok(())
}

fn write_cslg(
    v: &mut Vec<u8>,
    _header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    let timescale = stream_to_timescale(stream);

    let (min_ctts, max_ctts) = stream
        .chunks
        .iter()
        .flat_map(|c| {
            c.samples.iter().map(|b| {
                b.composition_time_offset
                    .expect("not all samples have a composition time offset")
            })
        })
        .fold((None, None), |(min, max), ctts| {
            (
                if min.map_or(true, |min| ctts < min) {
                    Some(ctts)
                } else {
                    min
                },
                if max.map_or(true, |max| ctts > max) {
                    Some(ctts)
                } else {
                    max
                },
            )
        });
    let min_ctts = min_ctts
        .unwrap()
        .mul_div_round(timescale as i64, gst::ClockTime::SECOND.nseconds() as i64)
        .context("too big composition time offset")?;
    let max_ctts = max_ctts
        .unwrap()
        .mul_div_round(timescale as i64, gst::ClockTime::SECOND.nseconds() as i64)
        .context("too big composition time offset")?;

    // Composition to DTS shift
    v.extend((-min_ctts).to_be_bytes());

    // least decode to display delta
    v.extend(min_ctts.to_be_bytes());

    // greatest decode to display delta
    v.extend(max_ctts.to_be_bytes());

    // composition start time
    let composition_start_time = stream
        .earliest_pts
        .nseconds()
        .mul_div_round(timescale as u64, gst::ClockTime::SECOND.nseconds() as u64)
        .context("too earliest PTS")?;
    v.extend(composition_start_time.to_be_bytes());

    // composition end time
    let composition_end_time = stream
        .end_pts
        .nseconds()
        .mul_div_round(timescale as u64, gst::ClockTime::SECOND.nseconds() as u64)
        .context("too end PTS")?;
    v.extend(composition_end_time.to_be_bytes());

    Ok(())
}

fn write_stss(
    v: &mut Vec<u8>,
    _header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    let entry_count_position = v.len();
    // Entry count, rewritten in the end
    v.extend(0u32.to_be_bytes());

    let mut num_entries = 0u32;
    for (idx, _sync_point) in stream
        .chunks
        .iter()
        .flat_map(|c| c.samples.iter().map(|b| b.sync_point))
        .enumerate()
        .filter(|(_idx, sync_point)| *sync_point)
    {
        v.extend((idx as u32 + 1).to_be_bytes());
        num_entries += 1;
    }

    // Rewrite entry count
    v[entry_count_position..][..4].copy_from_slice(&num_entries.to_be_bytes());

    Ok(())
}

fn write_stsz(
    v: &mut Vec<u8>,
    _header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    let first_sample_size = stream.chunks[0].samples[0].size;

    if stream
        .chunks
        .iter()
        .flat_map(|c| c.samples.iter().map(|b| b.size))
        .all(|size| size == first_sample_size)
    {
        // Sample size
        v.extend(first_sample_size.to_be_bytes());

        // Sample count
        let sample_count = stream
            .chunks
            .iter()
            .map(|c| c.samples.len() as u32)
            .sum::<u32>();
        v.extend(sample_count.to_be_bytes());
    } else {
        // Sample size
        v.extend(0u32.to_be_bytes());

        // Sample count, will be rewritten later
        let sample_count_position = v.len();
        let mut sample_count = 0u32;
        v.extend(0u32.to_be_bytes());

        for size in stream
            .chunks
            .iter()
            .flat_map(|c| c.samples.iter().map(|b| b.size))
        {
            v.extend(size.to_be_bytes());
            sample_count += 1;
        }

        v[sample_count_position..][..4].copy_from_slice(&sample_count.to_be_bytes());
    }

    Ok(())
}

fn write_stsc(
    v: &mut Vec<u8>,
    _header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    let entry_count_position = v.len();
    // Entry count, rewritten in the end
    v.extend(0u32.to_be_bytes());

    let mut num_entries = 0u32;
    let mut first_chunk = 1u32;
    let mut samples_per_chunk: Option<u32> = None;
    for (idx, chunk) in stream.chunks.iter().enumerate() {
        if samples_per_chunk.map_or(true, |samples_per_chunk| {
            samples_per_chunk != chunk.samples.len() as u32
        }) {
            if let Some(samples_per_chunk) = samples_per_chunk {
                v.extend(first_chunk.to_be_bytes());
                v.extend(samples_per_chunk.to_be_bytes());
                // sample description index
                v.extend(1u32.to_be_bytes());
                num_entries += 1;
            }
            samples_per_chunk = Some(chunk.samples.len() as u32);
            first_chunk = idx as u32 + 1;
        }
    }

    if let Some(samples_per_chunk) = samples_per_chunk {
        v.extend(first_chunk.to_be_bytes());
        v.extend(samples_per_chunk.to_be_bytes());
        // sample description index
        v.extend(1u32.to_be_bytes());
        num_entries += 1;
    }

    // Rewrite entry count
    v[entry_count_position..][..4].copy_from_slice(&num_entries.to_be_bytes());

    Ok(())
}

fn write_stco(
    v: &mut Vec<u8>,
    _header: &super::Header,
    stream: &super::Stream,
    co64: bool,
) -> Result<(), Error> {
    // Entry count
    v.extend((stream.chunks.len() as u32).to_be_bytes());

    for chunk in &stream.chunks {
        if co64 {
            v.extend(chunk.offset.to_be_bytes());
        } else {
            v.extend(u32::try_from(chunk.offset).unwrap().to_be_bytes());
        }
    }

    Ok(())
}

fn write_edts(
    v: &mut Vec<u8>,
    header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    write_full_box(v, b"elst", FULL_BOX_VERSION_1, 0, |v| {
        write_elst(v, header, stream)
    })?;

    Ok(())
}

fn write_elst(
    v: &mut Vec<u8>,
    header: &super::Header,
    stream: &super::Stream,
) -> Result<(), Error> {
    // In movie header timescale
    let timescale = header_to_timescale(header);

    let min_earliest_pts = header.streams.iter().map(|s| s.earliest_pts).min().unwrap();

    if min_earliest_pts != stream.earliest_pts {
        // Entry count
        v.extend(2u32.to_be_bytes());

        // First entry for the gap

        // Edit duration
        let gap = (stream.earliest_pts - min_earliest_pts)
            .nseconds()
            .mul_div_round(timescale as u64, gst::ClockTime::SECOND.nseconds())
            .context("too big gap")?;
        v.extend(gap.to_be_bytes());

        // Media time
        v.extend((-1i64).to_be_bytes());

        // Media rate
        v.extend(1u16.to_be_bytes());
        v.extend(0u16.to_be_bytes());
    } else {
        // Entry count
        v.extend(1u32.to_be_bytes());
    }

    // Edit duration
    let duration = (stream.end_pts - stream.earliest_pts)
        .nseconds()
        .mul_div_round(timescale as u64, gst::ClockTime::SECOND.nseconds())
        .context("too big track duration")?;
    v.extend(duration.to_be_bytes());

    // Media time
    if let Some(gst::Signed::Negative(start_dts)) = stream.start_dts {
        let shift = (stream.earliest_pts + start_dts)
            .nseconds()
            .mul_div_round(timescale as u64, gst::ClockTime::SECOND.nseconds())
            .context("too big track duration")?;

        v.extend(shift.to_be_bytes());
    } else {
        v.extend(0u64.to_be_bytes());
    }

    // Media rate
    v.extend(1u16.to_be_bytes());
    v.extend(0u16.to_be_bytes());

    Ok(())
}