net/aws/transcriber: translate: optional experimental translation tokenization

This commit adds an optional experimental translation tokenization feature.
It can be activated using the `tokenization-method` property on the
`translation_src_%u` pads. For the moment, the feature is deactivated by default.

The Translate web service (ws) accepts '<span></span>' tags in the input and
adds matching tags in the output. When an 'id' is also provided as an attribute
of the 'span', the matching output tag also uses this 'id'.

In the context of closed captions, the 'id's are of little use. However, we can
take advantage of the spans in the output to identify translation chunks, which
more or less reflect the rhythm of the input transcript.
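
For illustration, a hypothetical exchange with the Translate web service could
look like this (the actual tag placement in the output is decided by the
service, so this is only indicative):

    input:  <span>Bonjour</span> <span>tout le monde,</span> <span>comment allez-vous ?</span>
    output: <span>Hello</span> <span>everyone,</span> <span>how are you?</span>

Each output span can then be mapped back, in order, to the timestamp and
duration of the corresponding input Transcript Item.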

This commit adds simple spans (no 'id') to the input Transcript Items and
parses the resulting spans in the translated output, assigning the timestamps
and durations sequentially from the input Transcript Items. Edge cases such as
the absence of spans or nested spans were observed and are handled here.
Similarly, mismatches between the number of input and output items are taken
care of by a reconciliation step.

Note that this is still experimental and requires further testing.
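
As a minimal sketch of how an application could activate the feature, assuming
the element is registered as `awstranscriber` (the element and pad names below
are taken from this commit; the rest of the snippet is illustrative):

use gst::prelude::*;

fn setup_translation_pad() -> Result<(), Box<dyn std::error::Error>> {
    gst::init()?;
    let transcriber = gst::ElementFactory::make("awstranscriber").build()?;
    // Request a translation pad and select the target language.
    let pad = transcriber
        .request_pad_simple("translation_src_%u")
        .ok_or("no translation pad available")?;
    pad.set_property("language-code", "fr");
    // Experimental: defaults to "none"; "span-based" enables the
    // tokenization added by this commit.
    pad.set_property_from_str("tokenization-method", "span-based");
    Ok(())
}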

Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/merge_requests/1109>
Authored by François Laignel on 2023-03-11 16:27:51 +01:00, committed by GStreamer Marge Bot
parent 743e97738f
commit 299e25ab3c
5 changed files with 542 additions and 92 deletions

View file

@@ -650,6 +650,12 @@
"direction": "src",
"presence": "request",
"type": "GstTranslationSrcPad"
},
"translation_src_%%u": {
"caps": "text/x-raw:\n format: utf8\n",
"direction": "src",
"presence": "request",
"type": "GstTranslationSrcPad"
}
},
"properties": {
@@ -773,7 +779,7 @@
"construct": false,
"construct-only": false,
"controllable": false,
"default": "3000",
"default": "5000",
"max": "-1",
"min": "0",
"mutable": "ready",
@@ -858,6 +864,21 @@
}
]
},
"GstAwsTranscriberTranslationTokenizationMethod": {
"kind": "enum",
"values": [
{
"desc": "None: don't tokenize translations",
"name": "none",
"value": "0"
},
{
"desc": "Span based: insert spans in the transript text and use the resulting spans in the translations to reproduce speech pacing.",
"name": "span-based",
"value": "1"
}
]
},
"GstAwsTranscriberVocabularyFilterMethod": {
"kind": "enum",
"values": [
@@ -919,6 +940,18 @@
"readable": true,
"type": "gchararray",
"writable": true
},
"tokenization-method": {
"blurb": "The tokenization method to apply to translations",
"conditionally-available": false,
"construct": false,
"construct-only": false,
"controllable": false,
"default": "none (0)",
"mutable": "ready",
"readable": true,
"type": "GstAwsTranscriberTranslationTokenizationMethod",
"writable": true
}
}
}

View file

@@ -35,7 +35,10 @@ use once_cell::sync::Lazy;
use super::transcribe::{TranscriberLoop, TranscriptEvent, TranscriptItem, TranscriptionSettings};
use super::translate::{TranslatedItem, TranslationLoop, TranslationQueue};
use super::{AwsTranscriberResultStability, AwsTranscriberVocabularyFilterMethod, CAT};
use super::{
AwsTranscriberResultStability, AwsTranscriberVocabularyFilterMethod,
TranslationTokenizationMethod, CAT,
};
static RUNTIME: Lazy<runtime::Runtime> = Lazy::new(|| {
runtime::Builder::new_multi_thread()
@@ -73,6 +76,8 @@ pub const GRANULARITY: gst::ClockTime = gst::ClockTime::from_mseconds(100);
const OUTPUT_LANG_CODE_PROPERTY: &str = "language-code";
const DEFAULT_OUTPUT_LANG_CODE: Option<&str> = None;
const TRANSLATION_TOKENIZATION_PROPERTY: &str = "tokenization-method";
#[derive(Debug, Clone)]
pub(super) struct Settings {
transcribe_latency: gst::ClockTime,
@@ -850,8 +855,8 @@ struct TranslationPadTask {
needs_translate: bool,
translation_queue: TranslationQueue,
translation_loop_handle: Option<task::JoinHandle<Result<(), gst::ErrorMessage>>>,
to_translation_tx: Option<mpsc::Sender<TranscriptItem>>,
from_translation_rx: Option<mpsc::Receiver<TranslatedItem>>,
to_translation_tx: Option<mpsc::Sender<Vec<TranscriptItem>>>,
from_translation_rx: Option<mpsc::Receiver<Vec<TranslatedItem>>>,
translate_latency: gst::ClockTime,
transcript_lookahead: gst::ClockTime,
send_events: bool,
@@ -991,14 +996,14 @@ impl TranslationPadTask {
// before current latency budget is exhausted.
futures::select_biased! {
_ = timeout => return Ok(()),
translated_item = from_translation_rx.next() => {
let Some(translated_item) = translated_item else {
translated_items = from_translation_rx.next() => {
let Some(translated_items) = translated_items else {
const ERR: &str = "translation chan terminated";
gst::debug!(CAT, imp: self.pad, "{ERR}");
return Err(gst::error_msg!(gst::StreamError::Failed, ["{ERR}"]));
};
self.translated_items.push_back(translated_item);
self.translated_items.extend(translated_items);
self.pending_translations = self.pending_translations.saturating_sub(1);
return Ok(());
@@ -1027,9 +1032,9 @@ impl TranslationPadTask {
}
};
for item in transcript_items.iter() {
if let Some(ready_item) = self.translation_queue.push(item) {
self.send_for_translation(ready_item).await?;
for items in transcript_items.iter() {
if let Some(ready_items) = self.translation_queue.push(items) {
self.send_for_translation(ready_items).await?;
}
}
@@ -1072,19 +1077,12 @@ impl TranslationPadTask {
let deadline = translation_eta.saturating_sub(max_delay);
if let Some(ready_item) = self
if let Some(ready_items) = self
.translation_queue
.dequeue(deadline, self.transcript_lookahead)
{
gst::debug!(
CAT,
imp: self.pad,
"Forcing transcript at pts {} with duration {} to translation",
ready_item.pts,
ready_item.duration,
);
if self.send_for_translation(ready_item).await.is_err() {
gst::debug!(CAT, imp: self.pad, "Forcing {} transcripts to translation", ready_items.len());
if self.send_for_translation(ready_items).await.is_err() {
return false;
}
}
@@ -1240,13 +1238,13 @@ impl TranslationPadTask {
async fn send_for_translation(
&mut self,
transcript_item: TranscriptItem,
transcript_items: Vec<TranscriptItem>,
) -> Result<(), gst::ErrorMessage> {
let res = self
.to_translation_tx
.as_mut()
.expect("to_translation chan must be available in translation mode")
.send(transcript_item)
.send(transcript_items)
.await;
if res.is_err() {
@@ -1346,6 +1344,7 @@ impl TranslationPadTask {
&self.pad,
&elem_settings.language_code,
pad_settings.language_code.as_deref().unwrap(),
pad_settings.tokenization_method,
to_translation_rx,
from_translation_tx,
));
@@ -1384,6 +1383,7 @@ impl Default for TranslationPadState {
#[derive(Debug, Default, Clone)]
struct TranslationPadSettings {
language_code: Option<String>,
tokenization_method: TranslationTokenizationMethod,
}
#[derive(Debug, Default)]
@@ -1566,12 +1566,20 @@ impl ObjectSubclass for TranslationSrcPad {
impl ObjectImpl for TranslationSrcPad {
fn properties() -> &'static [glib::ParamSpec] {
static PROPERTIES: Lazy<Vec<glib::ParamSpec>> = Lazy::new(|| {
vec![glib::ParamSpecString::builder(OUTPUT_LANG_CODE_PROPERTY)
.nick("Language Code")
.blurb("The Language the Stream must be translated to")
.default_value(DEFAULT_OUTPUT_LANG_CODE)
.mutable_ready()
.build()]
vec![
glib::ParamSpecString::builder(OUTPUT_LANG_CODE_PROPERTY)
.nick("Language Code")
.blurb("The Language the Stream must be translated to")
.default_value(DEFAULT_OUTPUT_LANG_CODE)
.mutable_ready()
.build(),
glib::ParamSpecEnum::builder(TRANSLATION_TOKENIZATION_PROPERTY)
.nick("Translations tokenization method")
.blurb("The tokenization method to apply to translations")
.default_value(TranslationTokenizationMethod::default())
.mutable_ready()
.build(),
]
});
PROPERTIES.as_ref()
@@ -1582,6 +1590,9 @@ impl ObjectImpl for TranslationSrcPad {
OUTPUT_LANG_CODE_PROPERTY => {
self.settings.lock().unwrap().language_code = value.get().unwrap()
}
TRANSLATION_TOKENIZATION_PROPERTY => {
self.settings.lock().unwrap().tokenization_method = value.get().unwrap()
}
_ => unimplemented!(),
}
}
@@ -1589,6 +1600,9 @@ impl ObjectImpl for TranslationSrcPad {
fn property(&self, _id: usize, pspec: &glib::ParamSpec) -> glib::Value {
match pspec.name() {
OUTPUT_LANG_CODE_PROPERTY => self.settings.lock().unwrap().language_code.to_value(),
TRANSLATION_TOKENIZATION_PROPERTY => {
self.settings.lock().unwrap().tokenization_method.to_value()
}
_ => unimplemented!(),
}
}

View file

@@ -79,6 +79,21 @@ impl From<AwsTranscriberVocabularyFilterMethod> for VocabularyFilterMethod {
}
}
#[derive(Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash, Clone, Copy, glib::Enum)]
#[repr(u32)]
#[enum_type(name = "GstAwsTranscriberTranslationTokenizationMethod")]
#[non_exhaustive]
pub enum TranslationTokenizationMethod {
#[default]
#[enum_value(name = "None: don't tokenize translations", nick = "none")]
None = 0,
#[enum_value(
name = "Span based: insert spans in the transript text and use the resulting spans in the translations to reproduce speech pacing.",
nick = "span-based"
)]
SpanBased = 1,
}
glib::wrapper! {
pub struct Transcriber(ObjectSubclass<imp::Transcriber>) @extends gst::Element, gst::Object, @implements gst::ChildProxy;
}
@@ -94,6 +109,8 @@ pub fn register(plugin: &gst::Plugin) -> Result<(), glib::BoolError> {
.mark_as_plugin_api(gst::PluginAPIFlags::empty());
AwsTranscriberVocabularyFilterMethod::static_type()
.mark_as_plugin_api(gst::PluginAPIFlags::empty());
TranslationTokenizationMethod::static_type()
.mark_as_plugin_api(gst::PluginAPIFlags::empty());
TranslationSrcPad::static_type().mark_as_plugin_api(gst::PluginAPIFlags::empty());
}
gst::Element::register(

View file

@@ -69,18 +69,6 @@ impl TranscriptItem {
is_punctuation: matches!(item.r#type, Some(model::ItemType::Punctuation)),
})
}
#[inline]
pub fn push(&mut self, item: &TranscriptItem) {
self.duration += item.duration;
self.is_punctuation &= item.is_punctuation;
if !item.is_punctuation {
self.content.push(' ');
}
self.content.push_str(&item.content);
}
}
#[derive(Clone)]

View file

@@ -18,8 +18,12 @@ use std::collections::VecDeque;
use super::imp::TranslationSrcPad;
use super::transcribe::TranscriptItem;
use super::CAT;
use super::{TranslationTokenizationMethod, CAT};
const SPAN_START: &str = "<span>";
const SPAN_END: &str = "</span>";
#[derive(Debug)]
pub struct TranslatedItem {
pub pts: gst::ClockTime,
pub duration: gst::ClockTime,
@@ -49,7 +53,7 @@ impl TranslationQueue {
/// Pushes the provided item.
///
/// Returns `Some(..)` if items are ready for translation.
pub fn push(&mut self, transcript_item: &TranscriptItem) -> Option<TranscriptItem> {
pub fn push(&mut self, transcript_item: &TranscriptItem) -> Option<Vec<TranscriptItem>> {
// Keep track of the item individually so we can schedule translation precisely.
self.items.push_back(transcript_item.clone());
@@ -57,16 +61,7 @@
// This makes it a good chunk for translation.
// Concatenate as a single item for translation
let mut items = self.items.drain(..);
let mut item_acc = items.next()?;
for item in items {
item_acc.push(&item);
}
item_acc.push(transcript_item);
return Some(item_acc);
return Some(self.items.drain(..).collect());
}
// Regular case: no separator detected, don't push transcript items
@@ -78,12 +73,12 @@
/// Dequeues items from the specified `deadline` up to `lookahead`.
///
/// Returns `Some(..)` with the accumulated items matching the criteria.
/// Returns `Some(..)` if some items match the criteria.
pub fn dequeue(
&mut self,
deadline: gst::ClockTime,
lookahead: gst::ClockTime,
) -> Option<TranscriptItem> {
) -> Option<Vec<TranscriptItem>> {
if self.items.front()?.pts < deadline {
// First item is too early to be sent to translation now
// we can wait for more items to accumulate.
@@ -94,17 +89,16 @@
// Try to get up to lookahead more items to improve translation accuracy
let limit = deadline + lookahead;
let mut item_acc = self.items.pop_front().unwrap();
let mut items_acc = vec![self.items.pop_front().unwrap()];
while let Some(item) = self.items.front() {
if item.pts > limit {
break;
}
let item = self.items.pop_front().unwrap();
item_acc.push(&item);
items_acc.push(self.items.pop_front().unwrap());
}
Some(item_acc)
Some(items_acc)
}
}
@@ -113,8 +107,9 @@ pub struct TranslationLoop {
client: aws_translate::Client,
input_lang: String,
output_lang: String,
transcript_rx: mpsc::Receiver<TranscriptItem>,
translation_tx: mpsc::Sender<TranslatedItem>,
tokenization_method: TranslationTokenizationMethod,
transcript_rx: mpsc::Receiver<Vec<TranscriptItem>>,
translation_tx: mpsc::Sender<Vec<TranslatedItem>>,
}
impl TranslationLoop {
@@ -123,8 +118,9 @@ impl TranslationLoop {
pad: &TranslationSrcPad,
input_lang: &str,
output_lang: &str,
transcript_rx: mpsc::Receiver<TranscriptItem>,
translation_tx: mpsc::Sender<TranslatedItem>,
tokenization_method: TranslationTokenizationMethod,
transcript_rx: mpsc::Receiver<Vec<TranscriptItem>>,
translation_tx: mpsc::Sender<Vec<TranslatedItem>>,
) -> Self {
let aws_config = imp.aws_config.lock().unwrap();
let aws_config = aws_config
@@ -136,6 +132,7 @@
client: aws_sdk_translate::Client::new(aws_config),
input_lang: input_lang.to_string(),
output_lang: output_lang.to_string(),
tokenization_method,
transcript_rx,
translation_tx,
}
@@ -167,40 +164,70 @@
}
pub async fn run(mut self) -> Result<(), gst::ErrorMessage> {
while let Some(transcript_item) = self.transcript_rx.next().await {
let TranscriptItem {
pts,
duration,
content,
..
} = transcript_item;
use TranslationTokenizationMethod as Tokenization;
let translated_text = if content.is_empty() {
content
} else {
self.client
.translate_text()
.set_source_language_code(Some(self.input_lang.clone()))
.set_target_language_code(Some(self.output_lang.clone()))
.set_text(Some(content))
.send()
.await
.map_err(|err| {
let err = format!("Failed to call translation service: {err}");
gst::info!(CAT, imp: self.pad, "{err}");
gst::error_msg!(gst::LibraryError::Failed, ["{err}"])
})?
.translated_text
.unwrap_or_default()
while let Some(transcript_items) = self.transcript_rx.next().await {
if transcript_items.is_empty() {
continue;
}
let (ts_duration_list, content): (Vec<(gst::ClockTime, gst::ClockTime)>, String) =
transcript_items
.into_iter()
.map(|item| {
(
(item.pts, item.duration),
match self.tokenization_method {
Tokenization::None => item.content,
Tokenization::SpanBased => {
format!("{SPAN_START}{}{SPAN_END}", item.content)
}
},
)
})
.unzip();
gst::trace!(CAT, imp: self.pad, "Translating {content} with {ts_duration_list:?}");
let translated_text = self
.client
.translate_text()
.set_source_language_code(Some(self.input_lang.clone()))
.set_target_language_code(Some(self.output_lang.clone()))
.set_text(Some(content))
.send()
.await
.map_err(|err| {
let err = format!("Failed to call translation service: {err}");
gst::info!(CAT, imp: self.pad, "{err}");
gst::error_msg!(gst::LibraryError::Failed, ["{err}"])
})?
.translated_text
.unwrap_or_default();
gst::trace!(CAT, imp: self.pad, "Got translation {translated_text}");
let translated_items = match self.tokenization_method {
Tokenization::None => {
// Push translation as a single item
let mut ts_duration_iter = ts_duration_list.into_iter().peekable();
let &(first_pts, _) = ts_duration_iter.peek().expect("at least one item");
let (last_pts, last_duration) =
ts_duration_iter.last().expect("at least one item");
vec![TranslatedItem {
pts: first_pts,
duration: last_pts.saturating_sub(first_pts) + last_duration,
content: translated_text,
}]
}
Tokenization::SpanBased => span_tokenize_items(&translated_text, ts_duration_list),
};
let translated_item = TranslatedItem {
pts,
duration,
content: translated_text,
};
gst::trace!(CAT, imp: self.pad, "Sending {translated_items:?}");
if self.translation_tx.send(translated_item).await.is_err() {
if self.translation_tx.send(translated_items).await.is_err() {
gst::info!(
CAT,
imp: self.pad,
@@ -213,3 +240,374 @@ impl TranslationLoop {
Ok(())
}
}
/// Parses translated items from the `translation` `String` using `span` tags.
///
/// The `translation` is expected to have been returned by the `Translate` ws.
/// It can contain id-less `<span>` and `</span>` tags, matching similar
/// id-less tags from the content submitted to the `Translate` ws.
///
/// This parser accepts both serial `<span></span>` as well as nested
/// `<span><span></span></span>`.
///
/// The parsed items are assigned the ts and duration from `ts_duration_list`
/// in their order of appearance.
///
/// If more parsed items are found, the last item will concatenate the remaining items.
///
/// If fewer parsed items are found, the last item will be assigned the remaining
/// duration from the `ts_duration_list`.
fn span_tokenize_items(
translation: &str,
ts_duration_list: impl IntoIterator<Item = (gst::ClockTime, gst::ClockTime)>,
) -> Vec<TranslatedItem> {
const SPAN_START_LEN: usize = SPAN_START.len();
const SPAN_END_LEN: usize = SPAN_END.len();
let mut translated_items = vec![];
let mut ts_duration_iter = ts_duration_list.into_iter();
// Content for a translated item
let mut content = String::new();
// Alleged span chunk
let mut chunk = String::new();
for c in translation.chars() {
if content.is_empty() && c.is_whitespace() {
// ignore leading whitespaces
continue;
}
if chunk.is_empty() {
if c == '<' {
// Start an alleged span chunk
chunk.push(c);
} else {
content.push(c);
}
continue;
}
chunk.push(c);
match chunk.len() {
len if len < SPAN_START_LEN => continue,
SPAN_START_LEN => {
if chunk != SPAN_START {
continue;
}
// Got a <span>
}
SPAN_END_LEN => {
if chunk != SPAN_END {
continue;
}
// Got a </span>
}
_ => {
// Can no longer be a span
content.extend(chunk.drain(..));
continue;
}
}
// got a span
chunk.clear();
if content.is_empty() {
continue;
}
// Add pending content
// assign it the next pts and duration from the input list
if let Some((pts, duration)) = ts_duration_iter.next() {
translated_items.push(TranslatedItem {
pts,
duration,
content,
});
content = String::new();
} else if let Some(last_item) = translated_items.last_mut() {
// exhausted available pts and duration
// add content to last item
if !last_item.content.ends_with(' ') {
last_item.content.push(' ');
}
last_item.content.extend(content.drain(..));
}
}
content.extend(chunk.drain(..));
if !content.is_empty() {
// Add last content
if let Some((pts, mut duration)) = ts_duration_iter.next() {
if let Some((last_pts, last_duration)) = ts_duration_iter.last() {
// Fix remaining duration
duration = last_pts.saturating_sub(pts) + last_duration;
}
translated_items.push(TranslatedItem {
pts,
duration,
content,
});
} else if let Some(last_item) = translated_items.last_mut() {
// No more pts and duration in the index
// Add remaining content to the last item pushed
if !last_item.content.ends_with(' ') {
last_item.content.push(' ');
}
last_item.content.push_str(&content);
}
} else if let Some((last_pts, last_duration)) = ts_duration_iter.last() {
if let Some(last_item) = translated_items.last_mut() {
// No more content, but need to fix last item's duration
last_item.duration = last_pts.saturating_sub(last_item.pts) + last_duration;
}
}
translated_items
}
#[cfg(test)]
mod tests {
use super::span_tokenize_items;
use gst::prelude::*;
#[test]
fn serial_spans() {
let input = "<span>first</span> <span>second</span> <span>third</span>";
let ts_duration_list = vec![
(0.seconds(), 1.seconds()),
(1.seconds(), 2.seconds()),
(4.seconds(), 3.seconds()),
];
let mut items = span_tokenize_items(input, ts_duration_list).into_iter();
let first = items.next().unwrap();
assert_eq!(first.pts, 0.seconds());
assert_eq!(first.duration, 1.seconds());
assert_eq!(first.content, "first");
let second = items.next().unwrap();
assert_eq!(second.pts, 1.seconds());
assert_eq!(second.duration, 2.seconds());
assert_eq!(second.content, "second");
let third = items.next().unwrap();
assert_eq!(third.pts, 4.seconds());
assert_eq!(third.duration, 3.seconds());
assert_eq!(third.content, "third");
assert!(items.next().is_none());
}
#[test]
fn serial_and_nested_spans() {
let input = "<span>first</span> <span>second <span>third</span></span> <span>fourth</span>";
let ts_duration_list = vec![
(0.seconds(), 1.seconds()),
(1.seconds(), 2.seconds()),
(3.seconds(), 1.seconds()),
(4.seconds(), 2.seconds()),
];
let mut items = span_tokenize_items(input, ts_duration_list).into_iter();
let first = items.next().unwrap();
assert_eq!(first.pts, 0.seconds());
assert_eq!(first.duration, 1.seconds());
assert_eq!(first.content, "first");
let second = items.next().unwrap();
assert_eq!(second.pts, 1.seconds());
assert_eq!(second.duration, 2.seconds());
assert_eq!(second.content, "second ");
let third = items.next().unwrap();
assert_eq!(third.pts, 3.seconds());
assert_eq!(third.duration, 1.seconds());
assert_eq!(third.content, "third");
let fourth = items.next().unwrap();
assert_eq!(fourth.pts, 4.seconds());
assert_eq!(fourth.duration, 2.seconds());
assert_eq!(fourth.content, "fourth");
assert!(items.next().is_none());
}
#[test]
fn nonspaned_serial_and_nested_spans() {
let input = "Initial <span>first</span> <span>second <span>third</span></span> <span>fourth</span> final";
let ts_duration_list = vec![
(0.seconds(), 1.seconds()),
(1.seconds(), 1.seconds()),
(2.seconds(), 1.seconds()),
(3.seconds(), 1.seconds()),
(4.seconds(), 1.seconds()),
(5.seconds(), 1.seconds()),
];
let mut items = span_tokenize_items(input, ts_duration_list).into_iter();
let init = items.next().unwrap();
assert_eq!(init.pts, 0.seconds());
assert_eq!(init.duration, 1.seconds());
assert_eq!(init.content, "Initial ");
let first = items.next().unwrap();
assert_eq!(first.pts, 1.seconds());
assert_eq!(first.duration, 1.seconds());
assert_eq!(first.content, "first");
let second = items.next().unwrap();
assert_eq!(second.pts, 2.seconds());
assert_eq!(second.duration, 1.seconds());
assert_eq!(second.content, "second ");
let third = items.next().unwrap();
assert_eq!(third.pts, 3.seconds());
assert_eq!(third.duration, 1.seconds());
assert_eq!(third.content, "third");
let fourth = items.next().unwrap();
assert_eq!(fourth.pts, 4.seconds());
assert_eq!(fourth.duration, 1.seconds());
assert_eq!(fourth.content, "fourth");
let final_ = items.next().unwrap();
assert_eq!(final_.pts, 5.seconds());
assert_eq!(final_.duration, 1.seconds());
assert_eq!(final_.content, "final");
assert!(items.next().is_none());
}
#[test]
fn more_parsed_items() {
let input = "<span>first</span> <span>second</span> <span>third</span> <span>fourth</span>";
let ts_duration_list = vec![
(0.seconds(), 1.seconds()),
(1.seconds(), 2.seconds()),
(4.seconds(), 3.seconds()),
];
let mut items = span_tokenize_items(input, ts_duration_list).into_iter();
let first = items.next().unwrap();
assert_eq!(first.pts, 0.seconds());
assert_eq!(first.duration, 1.seconds());
assert_eq!(first.content, "first");
let second = items.next().unwrap();
assert_eq!(second.pts, 1.seconds());
assert_eq!(second.duration, 2.seconds());
assert_eq!(second.content, "second");
let third = items.next().unwrap();
assert_eq!(third.pts, 4.seconds());
assert_eq!(third.duration, 3.seconds());
assert_eq!(third.content, "third fourth");
assert!(items.next().is_none());
}
#[test]
fn more_parsed_items_nonspan_final() {
let input = "<span>first</span> <span>second</span> <span>third</span> final";
let ts_duration_list = vec![
(0.seconds(), 1.seconds()),
(1.seconds(), 2.seconds()),
(4.seconds(), 3.seconds()),
];
let mut items = span_tokenize_items(input, ts_duration_list).into_iter();
let first = items.next().unwrap();
assert_eq!(first.pts, 0.seconds());
assert_eq!(first.duration, 1.seconds());
assert_eq!(first.content, "first");
let second = items.next().unwrap();
assert_eq!(second.pts, 1.seconds());
assert_eq!(second.duration, 2.seconds());
assert_eq!(second.content, "second");
let third = items.next().unwrap();
assert_eq!(third.pts, 4.seconds());
assert_eq!(third.duration, 3.seconds());
assert_eq!(third.content, "third final");
assert!(items.next().is_none());
}
#[test]
fn less_parsed_items() {
let input = "<span>first</span> <span>second</span>";
let ts_duration_list = vec![
(0.seconds(), 1.seconds()),
(1.seconds(), 2.seconds()),
(4.seconds(), 3.seconds()),
];
let mut items = span_tokenize_items(input, ts_duration_list).into_iter();
let first = items.next().unwrap();
assert_eq!(first.pts, 0.seconds());
assert_eq!(first.duration, 1.seconds());
assert_eq!(first.content, "first");
let second = items.next().unwrap();
assert_eq!(second.pts, 1.seconds());
assert_eq!(second.duration, 6.seconds());
assert_eq!(second.content, "second");
assert!(items.next().is_none());
}
#[test]
fn less_parsed_items_nonspan_final() {
let input = "<span>first</span> final";
let ts_duration_list = vec![
(0.seconds(), 1.seconds()),
(1.seconds(), 2.seconds()),
(4.seconds(), 3.seconds()),
];
let mut items = span_tokenize_items(input, ts_duration_list).into_iter();
let first = items.next().unwrap();
assert_eq!(first.pts, 0.seconds());
assert_eq!(first.duration, 1.seconds());
assert_eq!(first.content, "first");
let final_ = items.next().unwrap();
assert_eq!(final_.pts, 1.seconds());
assert_eq!(final_.duration, 6.seconds());
assert_eq!(final_.content, "final");
assert!(items.next().is_none());
}
#[test]
fn utf8_input() {
let input = "caractères accentués";
let ts_duration_list = vec![(0.seconds(), 1.seconds())];
let mut items = span_tokenize_items(input, ts_duration_list).into_iter();
let first = items.next().unwrap();
assert_eq!(first.pts, 0.seconds());
assert_eq!(first.duration, 1.seconds());
assert_eq!(first.content, "caractères accentués");
assert!(items.next().is_none());
}
}