diff --git a/docs/plugins/gst_plugins_cache.json b/docs/plugins/gst_plugins_cache.json index c6e6e16b..982e0d02 100644 --- a/docs/plugins/gst_plugins_cache.json +++ b/docs/plugins/gst_plugins_cache.json @@ -650,6 +650,12 @@ "direction": "src", "presence": "request", "type": "GstTranslationSrcPad" + }, + "translation_src_%%u": { + "caps": "text/x-raw:\n format: utf8\n", + "direction": "src", + "presence": "request", + "type": "GstTranslationSrcPad" } }, "properties": { @@ -773,7 +779,7 @@ "construct": false, "construct-only": false, "controllable": false, - "default": "3000", + "default": "5000", "max": "-1", "min": "0", "mutable": "ready", @@ -858,6 +864,21 @@ } ] }, + "GstAwsTranscriberTranslationTokenizationMethod": { + "kind": "enum", + "values": [ + { + "desc": "None: don't tokenize translations", + "name": "none", + "value": "0" + }, + { + "desc": "Span based: insert spans in the transript text and use the resulting spans in the translations to reproduce speech pacing.", + "name": "span-based", + "value": "1" + } + ] + }, "GstAwsTranscriberVocabularyFilterMethod": { "kind": "enum", "values": [ @@ -919,6 +940,18 @@ "readable": true, "type": "gchararray", "writable": true + }, + "tokenization-method": { + "blurb": "The tokenization method to apply to translations", + "conditionally-available": false, + "construct": false, + "construct-only": false, + "controllable": false, + "default": "none (0)", + "mutable": "ready", + "readable": true, + "type": "GstAwsTranscriberTranslationTokenizationMethod", + "writable": true } } } diff --git a/net/aws/src/transcriber/imp.rs b/net/aws/src/transcriber/imp.rs index d856e6d4..b8238dff 100644 --- a/net/aws/src/transcriber/imp.rs +++ b/net/aws/src/transcriber/imp.rs @@ -35,7 +35,10 @@ use once_cell::sync::Lazy; use super::transcribe::{TranscriberLoop, TranscriptEvent, TranscriptItem, TranscriptionSettings}; use super::translate::{TranslatedItem, TranslationLoop, TranslationQueue}; -use super::{AwsTranscriberResultStability, 
AwsTranscriberVocabularyFilterMethod, CAT}; +use super::{ + AwsTranscriberResultStability, AwsTranscriberVocabularyFilterMethod, + TranslationTokenizationMethod, CAT, +}; static RUNTIME: Lazy = Lazy::new(|| { runtime::Builder::new_multi_thread() @@ -73,6 +76,8 @@ pub const GRANULARITY: gst::ClockTime = gst::ClockTime::from_mseconds(100); const OUTPUT_LANG_CODE_PROPERTY: &str = "language-code"; const DEFAULT_OUTPUT_LANG_CODE: Option<&str> = None; +const TRANSLATION_TOKENIZATION_PROPERTY: &str = "tokenization-method"; + #[derive(Debug, Clone)] pub(super) struct Settings { transcribe_latency: gst::ClockTime, @@ -850,8 +855,8 @@ struct TranslationPadTask { needs_translate: bool, translation_queue: TranslationQueue, translation_loop_handle: Option>>, - to_translation_tx: Option>, - from_translation_rx: Option>, + to_translation_tx: Option>>, + from_translation_rx: Option>>, translate_latency: gst::ClockTime, transcript_lookahead: gst::ClockTime, send_events: bool, @@ -991,14 +996,14 @@ impl TranslationPadTask { // before current latency budget is exhausted. futures::select_biased! 
{ _ = timeout => return Ok(()), - translated_item = from_translation_rx.next() => { - let Some(translated_item) = translated_item else { + translated_items = from_translation_rx.next() => { + let Some(translated_items) = translated_items else { const ERR: &str = "translation chan terminated"; gst::debug!(CAT, imp: self.pad, "{ERR}"); return Err(gst::error_msg!(gst::StreamError::Failed, ["{ERR}"])); }; - self.translated_items.push_back(translated_item); + self.translated_items.extend(translated_items); self.pending_translations = self.pending_translations.saturating_sub(1); return Ok(()); @@ -1027,9 +1032,9 @@ impl TranslationPadTask { } }; - for item in transcript_items.iter() { - if let Some(ready_item) = self.translation_queue.push(item) { - self.send_for_translation(ready_item).await?; + for items in transcript_items.iter() { + if let Some(ready_items) = self.translation_queue.push(items) { + self.send_for_translation(ready_items).await?; } } @@ -1072,19 +1077,12 @@ impl TranslationPadTask { let deadline = translation_eta.saturating_sub(max_delay); - if let Some(ready_item) = self + if let Some(ready_items) = self .translation_queue .dequeue(deadline, self.transcript_lookahead) { - gst::debug!( - CAT, - imp: self.pad, - "Forcing transcript at pts {} with duration {} to translation", - ready_item.pts, - ready_item.duration, - ); - - if self.send_for_translation(ready_item).await.is_err() { + gst::debug!(CAT, imp: self.pad, "Forcing {} transcripts to translation", ready_items.len()); + if self.send_for_translation(ready_items).await.is_err() { return false; } } @@ -1240,13 +1238,13 @@ impl TranslationPadTask { async fn send_for_translation( &mut self, - transcript_item: TranscriptItem, + transcript_items: Vec, ) -> Result<(), gst::ErrorMessage> { let res = self .to_translation_tx .as_mut() .expect("to_translation chan must be available in translation mode") - .send(transcript_item) + .send(transcript_items) .await; if res.is_err() { @@ -1346,6 +1344,7 @@ impl 
TranslationPadTask { &self.pad, &elem_settings.language_code, pad_settings.language_code.as_deref().unwrap(), + pad_settings.tokenization_method, to_translation_rx, from_translation_tx, )); @@ -1384,6 +1383,7 @@ impl Default for TranslationPadState { #[derive(Debug, Default, Clone)] struct TranslationPadSettings { language_code: Option, + tokenization_method: TranslationTokenizationMethod, } #[derive(Debug, Default)] @@ -1566,12 +1566,20 @@ impl ObjectSubclass for TranslationSrcPad { impl ObjectImpl for TranslationSrcPad { fn properties() -> &'static [glib::ParamSpec] { static PROPERTIES: Lazy> = Lazy::new(|| { - vec![glib::ParamSpecString::builder(OUTPUT_LANG_CODE_PROPERTY) - .nick("Language Code") - .blurb("The Language the Stream must be translated to") - .default_value(DEFAULT_OUTPUT_LANG_CODE) - .mutable_ready() - .build()] + vec![ + glib::ParamSpecString::builder(OUTPUT_LANG_CODE_PROPERTY) + .nick("Language Code") + .blurb("The Language the Stream must be translated to") + .default_value(DEFAULT_OUTPUT_LANG_CODE) + .mutable_ready() + .build(), + glib::ParamSpecEnum::builder(TRANSLATION_TOKENIZATION_PROPERTY) + .nick("Translations tokenization method") + .blurb("The tokenization method to apply to translations") + .default_value(TranslationTokenizationMethod::default()) + .mutable_ready() + .build(), + ] }); PROPERTIES.as_ref() @@ -1582,6 +1590,9 @@ impl ObjectImpl for TranslationSrcPad { OUTPUT_LANG_CODE_PROPERTY => { self.settings.lock().unwrap().language_code = value.get().unwrap() } + TRANSLATION_TOKENIZATION_PROPERTY => { + self.settings.lock().unwrap().tokenization_method = value.get().unwrap() + } _ => unimplemented!(), } } @@ -1589,6 +1600,9 @@ impl ObjectImpl for TranslationSrcPad { fn property(&self, _id: usize, pspec: &glib::ParamSpec) -> glib::Value { match pspec.name() { OUTPUT_LANG_CODE_PROPERTY => self.settings.lock().unwrap().language_code.to_value(), + TRANSLATION_TOKENIZATION_PROPERTY => { + 
self.settings.lock().unwrap().tokenization_method.to_value() + } _ => unimplemented!(), } } diff --git a/net/aws/src/transcriber/mod.rs b/net/aws/src/transcriber/mod.rs index eb2a28f7..faad2748 100644 --- a/net/aws/src/transcriber/mod.rs +++ b/net/aws/src/transcriber/mod.rs @@ -79,6 +79,21 @@ impl From for VocabularyFilterMethod { } } +#[derive(Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash, Clone, Copy, glib::Enum)] +#[repr(u32)] +#[enum_type(name = "GstAwsTranscriberTranslationTokenizationMethod")] +#[non_exhaustive] +pub enum TranslationTokenizationMethod { + #[default] + #[enum_value(name = "None: don't tokenize translations", nick = "none")] + None = 0, + #[enum_value( + name = "Span based: insert spans in the transript text and use the resulting spans in the translations to reproduce speech pacing.", + nick = "span-based" + )] + SpanBased = 1, +} + glib::wrapper! { pub struct Transcriber(ObjectSubclass) @extends gst::Element, gst::Object, @implements gst::ChildProxy; } @@ -94,6 +109,8 @@ pub fn register(plugin: &gst::Plugin) -> Result<(), glib::BoolError> { .mark_as_plugin_api(gst::PluginAPIFlags::empty()); AwsTranscriberVocabularyFilterMethod::static_type() .mark_as_plugin_api(gst::PluginAPIFlags::empty()); + TranslationTokenizationMethod::static_type() + .mark_as_plugin_api(gst::PluginAPIFlags::empty()); TranslationSrcPad::static_type().mark_as_plugin_api(gst::PluginAPIFlags::empty()); } gst::Element::register( diff --git a/net/aws/src/transcriber/transcribe.rs b/net/aws/src/transcriber/transcribe.rs index 7b683f3b..97301380 100644 --- a/net/aws/src/transcriber/transcribe.rs +++ b/net/aws/src/transcriber/transcribe.rs @@ -69,18 +69,6 @@ impl TranscriptItem { is_punctuation: matches!(item.r#type, Some(model::ItemType::Punctuation)), }) } - - #[inline] - pub fn push(&mut self, item: &TranscriptItem) { - self.duration += item.duration; - - self.is_punctuation &= item.is_punctuation; - if !item.is_punctuation { - self.content.push(' '); - } - - 
self.content.push_str(&item.content); - } } #[derive(Clone)] diff --git a/net/aws/src/transcriber/translate.rs b/net/aws/src/transcriber/translate.rs index b689bd63..fc49674d 100644 --- a/net/aws/src/transcriber/translate.rs +++ b/net/aws/src/transcriber/translate.rs @@ -18,8 +18,12 @@ use std::collections::VecDeque; use super::imp::TranslationSrcPad; use super::transcribe::TranscriptItem; -use super::CAT; +use super::{TranslationTokenizationMethod, CAT}; +const SPAN_START: &str = ""; +const SPAN_END: &str = ""; + +#[derive(Debug)] pub struct TranslatedItem { pub pts: gst::ClockTime, pub duration: gst::ClockTime, @@ -49,7 +53,7 @@ impl TranslationQueue { /// Pushes the provided item. /// /// Returns `Some(..)` if items are ready for translation. - pub fn push(&mut self, transcript_item: &TranscriptItem) -> Option { + pub fn push(&mut self, transcript_item: &TranscriptItem) -> Option> { // Keep track of the item individually so we can schedule translation precisely. self.items.push_back(transcript_item.clone()); @@ -57,16 +61,7 @@ impl TranslationQueue { // This makes it a good chunk for translation. // Concatenate as a single item for translation - let mut items = self.items.drain(..); - - let mut item_acc = items.next()?; - for item in items { - item_acc.push(&item); - } - - item_acc.push(transcript_item); - - return Some(item_acc); + return Some(self.items.drain(..).collect()); } // Regular case: no separator detected, don't push transcript items @@ -78,12 +73,12 @@ impl TranslationQueue { /// Dequeues items from the specified `deadline` up to `lookahead`. /// - /// Returns `Some(..)` with the accumulated items matching the criteria. + /// Returns `Some(..)` if some items match the criteria. pub fn dequeue( &mut self, deadline: gst::ClockTime, lookahead: gst::ClockTime, - ) -> Option { + ) -> Option> { if self.items.front()?.pts < deadline { // First item is too early to be sent to translation now // we can wait for more items to accumulate. 
@@ -94,17 +89,16 @@ impl TranslationQueue { // Try to get up to lookahead more items to improve translation accuracy let limit = deadline + lookahead; - let mut item_acc = self.items.pop_front().unwrap(); + let mut items_acc = vec![self.items.pop_front().unwrap()]; while let Some(item) = self.items.front() { if item.pts > limit { break; } - let item = self.items.pop_front().unwrap(); - item_acc.push(&item); + items_acc.push(self.items.pop_front().unwrap()); } - Some(item_acc) + Some(items_acc) } } @@ -113,8 +107,9 @@ pub struct TranslationLoop { client: aws_translate::Client, input_lang: String, output_lang: String, - transcript_rx: mpsc::Receiver, - translation_tx: mpsc::Sender, + tokenization_method: TranslationTokenizationMethod, + transcript_rx: mpsc::Receiver>, + translation_tx: mpsc::Sender>, } impl TranslationLoop { @@ -123,8 +118,9 @@ impl TranslationLoop { pad: &TranslationSrcPad, input_lang: &str, output_lang: &str, - transcript_rx: mpsc::Receiver, - translation_tx: mpsc::Sender, + tokenization_method: TranslationTokenizationMethod, + transcript_rx: mpsc::Receiver>, + translation_tx: mpsc::Sender>, ) -> Self { let aws_config = imp.aws_config.lock().unwrap(); let aws_config = aws_config @@ -136,6 +132,7 @@ impl TranslationLoop { client: aws_sdk_translate::Client::new(aws_config), input_lang: input_lang.to_string(), output_lang: output_lang.to_string(), + tokenization_method, transcript_rx, translation_tx, } @@ -167,40 +164,70 @@ impl TranslationLoop { } pub async fn run(mut self) -> Result<(), gst::ErrorMessage> { - while let Some(transcript_item) = self.transcript_rx.next().await { - let TranscriptItem { - pts, - duration, - content, - .. 
- } = transcript_item; + use TranslationTokenizationMethod as Tokenization; - let translated_text = if content.is_empty() { - content - } else { - self.client - .translate_text() - .set_source_language_code(Some(self.input_lang.clone())) - .set_target_language_code(Some(self.output_lang.clone())) - .set_text(Some(content)) - .send() - .await - .map_err(|err| { - let err = format!("Failed to call translation service: {err}"); - gst::info!(CAT, imp: self.pad, "{err}"); - gst::error_msg!(gst::LibraryError::Failed, ["{err}"]) - })? - .translated_text - .unwrap_or_default() + while let Some(transcript_items) = self.transcript_rx.next().await { + if transcript_items.is_empty() { + continue; + } + + let mut ts_duration_list = Vec::with_capacity(transcript_items.len()); + let mut content = String::new(); + for item in transcript_items { + // Separate words with a space; punctuation stays attached to the previous word. + if !content.is_empty() && !item.is_punctuation { + content.push(' '); + } + ts_duration_list.push((item.pts, item.duration)); + match self.tokenization_method { + Tokenization::None => content.push_str(&item.content), + Tokenization::SpanBased => { + content.push_str(&format!("{SPAN_START}{}{SPAN_END}", item.content)) + } + } + } + + gst::trace!(CAT, imp: self.pad, "Translating {content} with {ts_duration_list:?}"); + + let translated_text = self + .client + .translate_text() + .set_source_language_code(Some(self.input_lang.clone())) + .set_target_language_code(Some(self.output_lang.clone())) + .set_text(Some(content)) + .send() + .await + .map_err(|err| { + let err = format!("Failed to call translation service: {err}"); + gst::info!(CAT, imp: self.pad, "{err}"); + gst::error_msg!(gst::LibraryError::Failed, ["{err}"]) + })?
+ .translated_text + .unwrap_or_default(); + + gst::trace!(CAT, imp: self.pad, "Got translation {translated_text}"); + + let translated_items = match self.tokenization_method { + Tokenization::None => { + // Push translation as a single item + let mut ts_duration_iter = ts_duration_list.into_iter().peekable(); + + let &(first_pts, _) = ts_duration_iter.peek().expect("at least one item"); + let (last_pts, last_duration) = + ts_duration_iter.last().expect("at least one item"); + + vec![TranslatedItem { + pts: first_pts, + duration: last_pts.saturating_sub(first_pts) + last_duration, + content: translated_text, + }] + } + Tokenization::SpanBased => span_tokenize_items(&translated_text, ts_duration_list), }; - let translated_item = TranslatedItem { - pts, - duration, - content: translated_text, - }; + gst::trace!(CAT, imp: self.pad, "Sending {translated_items:?}"); - if self.translation_tx.send(translated_item).await.is_err() { + if self.translation_tx.send(translated_items).await.is_err() { gst::info!( CAT, imp: self.pad, @@ -213,3 +240,374 @@ impl TranslationLoop { Ok(()) } } + +/// Parses translated items from the `translation` `String` using `span` tags. +/// +/// The `translation` is expected to have been returned by the `Translate` ws. +/// It can contain id-less `<span>` and `</span>` tags, matching similar +/// id-less tags from the content submitted to the `Translate` ws. +/// +/// This parser accepts both serial `<span></span><span></span>` as well as nested +/// `<span><span></span></span>`. +/// +/// The parsed items are assigned the ts and duration from `ts_duration_list` +/// in their order of appearance. +/// +/// If more parsed items are found, the remaining content is appended to the last item. +/// +/// If fewer parsed items are found, the last item is assigned the remaining +/// duration from the `ts_duration_list`.
+fn span_tokenize_items( + translation: &str, + ts_duration_list: impl IntoIterator, +) -> Vec { + const SPAN_START_LEN: usize = SPAN_START.len(); + const SPAN_END_LEN: usize = SPAN_END.len(); + + let mut translated_items = vec![]; + + let mut ts_duration_iter = ts_duration_list.into_iter(); + + // Content accumulated for the translated item being built + let mut content = String::new(); + + // Candidate span tag being accumulated (starts at '<') + let mut chunk = String::new(); + + for c in translation.chars() { + if content.is_empty() && c.is_whitespace() { + // Ignore leading whitespace of an item's content + continue; + } + + if chunk.is_empty() { + if c == '<' { + // Start a candidate span tag + chunk.push(c); + } else { + content.push(c); + } + + continue; + } + + chunk.push(c); + + match chunk.len() { + len if len < SPAN_START_LEN => continue, + SPAN_START_LEN => { + if chunk != SPAN_START { + continue; + } + // Got a span start tag + } + SPAN_END_LEN => { + if chunk != SPAN_END { + continue; + } + // Got a span end tag + } + _ => { + // Too long to be a span tag: treat the chunk as regular content + content.extend(chunk.drain(..)); + continue; + } + } + + // A complete span tag was found: the pending content ends here + chunk.clear(); + + if content.is_empty() { + continue; + } + + // Add pending content + // assign it the next pts and duration from the input list + if let Some((pts, duration)) = ts_duration_iter.next() { + translated_items.push(TranslatedItem { + pts, + duration, + content, + }); + + content = String::new(); + } else if let Some(last_item) = translated_items.last_mut() { + // Exhausted available pts and duration: + // append the content to the last item, separated by a space + if !last_item.content.ends_with(' ') { + last_item.content.push(' '); + } + last_item.content.extend(content.drain(..)); + } + } + + content.extend(chunk.drain(..)); + + if !content.is_empty() { + // Add last content + if let Some((pts, mut duration)) = ts_duration_iter.next() { + if let Some((last_pts, last_duration)) = ts_duration_iter.last() { + // Extend the duration up to the end of the last remaining entry + duration = last_pts.saturating_sub(pts) + last_duration; + } + + translated_items.push(TranslatedItem { + pts, +
duration, + content, + }); + } else if let Some(last_item) = translated_items.last_mut() { + // No more pts and duration in the index + // Add remaining content to the last item pushed + if !last_item.content.ends_with(' ') { + last_item.content.push(' '); + } + last_item.content.push_str(&content); + } + } else if let Some((last_pts, last_duration)) = ts_duration_iter.last() { + if let Some(last_item) = translated_items.last_mut() { + // No more content, but need to fix last item's duration + last_item.duration = last_pts.saturating_sub(last_item.pts) + last_duration; + } + } + + translated_items +} + +#[cfg(test)] +mod tests { + use super::span_tokenize_items; + use gst::prelude::*; + + #[test] + fn serial_spans() { + let input = "first second third"; + let ts_duration_list = vec![ + (0.seconds(), 1.seconds()), + (1.seconds(), 2.seconds()), + (4.seconds(), 3.seconds()), + ]; + + let mut items = span_tokenize_items(input, ts_duration_list).into_iter(); + + let first = items.next().unwrap(); + assert_eq!(first.pts, 0.seconds()); + assert_eq!(first.duration, 1.seconds()); + assert_eq!(first.content, "first"); + + let second = items.next().unwrap(); + assert_eq!(second.pts, 1.seconds()); + assert_eq!(second.duration, 2.seconds()); + assert_eq!(second.content, "second"); + + let third = items.next().unwrap(); + assert_eq!(third.pts, 4.seconds()); + assert_eq!(third.duration, 3.seconds()); + assert_eq!(third.content, "third"); + + assert!(items.next().is_none()); + } + + #[test] + fn serial_and_nested_spans() { + let input = "first second third fourth"; + let ts_duration_list = vec![ + (0.seconds(), 1.seconds()), + (1.seconds(), 2.seconds()), + (3.seconds(), 1.seconds()), + (4.seconds(), 2.seconds()), + ]; + + let mut items = span_tokenize_items(input, ts_duration_list).into_iter(); + + let first = items.next().unwrap(); + assert_eq!(first.pts, 0.seconds()); + assert_eq!(first.duration, 1.seconds()); + assert_eq!(first.content, "first"); + + let second = 
items.next().unwrap(); + assert_eq!(second.pts, 1.seconds()); + assert_eq!(second.duration, 2.seconds()); + assert_eq!(second.content, "second "); + + let third = items.next().unwrap(); + assert_eq!(third.pts, 3.seconds()); + assert_eq!(third.duration, 1.seconds()); + assert_eq!(third.content, "third"); + + let fourth = items.next().unwrap(); + assert_eq!(fourth.pts, 4.seconds()); + assert_eq!(fourth.duration, 2.seconds()); + assert_eq!(fourth.content, "fourth"); + + assert!(items.next().is_none()); + } + + #[test] + fn nonspaned_serial_and_nested_spans() { + let input = "Initial first second third fourth final"; + let ts_duration_list = vec![ + (0.seconds(), 1.seconds()), + (1.seconds(), 1.seconds()), + (2.seconds(), 1.seconds()), + (3.seconds(), 1.seconds()), + (4.seconds(), 1.seconds()), + (5.seconds(), 1.seconds()), + ]; + + let mut items = span_tokenize_items(input, ts_duration_list).into_iter(); + + let init = items.next().unwrap(); + assert_eq!(init.pts, 0.seconds()); + assert_eq!(init.duration, 1.seconds()); + assert_eq!(init.content, "Initial "); + + let first = items.next().unwrap(); + assert_eq!(first.pts, 1.seconds()); + assert_eq!(first.duration, 1.seconds()); + assert_eq!(first.content, "first"); + + let second = items.next().unwrap(); + assert_eq!(second.pts, 2.seconds()); + assert_eq!(second.duration, 1.seconds()); + assert_eq!(second.content, "second "); + + let third = items.next().unwrap(); + assert_eq!(third.pts, 3.seconds()); + assert_eq!(third.duration, 1.seconds()); + assert_eq!(third.content, "third"); + + let fourth = items.next().unwrap(); + assert_eq!(fourth.pts, 4.seconds()); + assert_eq!(fourth.duration, 1.seconds()); + assert_eq!(fourth.content, "fourth"); + + let final_ = items.next().unwrap(); + assert_eq!(final_.pts, 5.seconds()); + assert_eq!(final_.duration, 1.seconds()); + assert_eq!(final_.content, "final"); + + assert!(items.next().is_none()); + } + + #[test] + fn more_parsed_items() { + let input = "first second third fourth"; 
+ let ts_duration_list = vec![ + (0.seconds(), 1.seconds()), + (1.seconds(), 2.seconds()), + (4.seconds(), 3.seconds()), + ]; + + let mut items = span_tokenize_items(input, ts_duration_list).into_iter(); + + let first = items.next().unwrap(); + assert_eq!(first.pts, 0.seconds()); + assert_eq!(first.duration, 1.seconds()); + assert_eq!(first.content, "first"); + + let second = items.next().unwrap(); + assert_eq!(second.pts, 1.seconds()); + assert_eq!(second.duration, 2.seconds()); + assert_eq!(second.content, "second"); + + let third = items.next().unwrap(); + assert_eq!(third.pts, 4.seconds()); + assert_eq!(third.duration, 3.seconds()); + assert_eq!(third.content, "third fourth"); + + assert!(items.next().is_none()); + } + + #[test] + fn more_parsed_items_nonspan_final() { + let input = "first second third final"; + let ts_duration_list = vec![ + (0.seconds(), 1.seconds()), + (1.seconds(), 2.seconds()), + (4.seconds(), 3.seconds()), + ]; + + let mut items = span_tokenize_items(input, ts_duration_list).into_iter(); + + let first = items.next().unwrap(); + assert_eq!(first.pts, 0.seconds()); + assert_eq!(first.duration, 1.seconds()); + assert_eq!(first.content, "first"); + + let second = items.next().unwrap(); + assert_eq!(second.pts, 1.seconds()); + assert_eq!(second.duration, 2.seconds()); + assert_eq!(second.content, "second"); + + let third = items.next().unwrap(); + assert_eq!(third.pts, 4.seconds()); + assert_eq!(third.duration, 3.seconds()); + assert_eq!(third.content, "third final"); + + assert!(items.next().is_none()); + } + + #[test] + fn less_parsed_items() { + let input = "first second"; + let ts_duration_list = vec![ + (0.seconds(), 1.seconds()), + (1.seconds(), 2.seconds()), + (4.seconds(), 3.seconds()), + ]; + + let mut items = span_tokenize_items(input, ts_duration_list).into_iter(); + + let first = items.next().unwrap(); + assert_eq!(first.pts, 0.seconds()); + assert_eq!(first.duration, 1.seconds()); + assert_eq!(first.content, "first"); + + let 
second = items.next().unwrap(); + assert_eq!(second.pts, 1.seconds()); + assert_eq!(second.duration, 6.seconds()); + assert_eq!(second.content, "second"); + + assert!(items.next().is_none()); + } + + #[test] + fn less_parsed_items_nonspan_final() { + let input = "first final"; + let ts_duration_list = vec![ + (0.seconds(), 1.seconds()), + (1.seconds(), 2.seconds()), + (4.seconds(), 3.seconds()), + ]; + + let mut items = span_tokenize_items(input, ts_duration_list).into_iter(); + + let first = items.next().unwrap(); + assert_eq!(first.pts, 0.seconds()); + assert_eq!(first.duration, 1.seconds()); + assert_eq!(first.content, "first"); + + let final_ = items.next().unwrap(); + assert_eq!(final_.pts, 1.seconds()); + assert_eq!(final_.duration, 6.seconds()); + assert_eq!(final_.content, "final"); + + assert!(items.next().is_none()); + } + + #[test] + fn utf8_input() { + let input = "caractères accentués"; + let ts_duration_list = vec![(0.seconds(), 1.seconds())]; + + let mut items = span_tokenize_items(input, ts_duration_list).into_iter(); + + let first = items.next().unwrap(); + assert_eq!(first.pts, 0.seconds()); + assert_eq!(first.duration, 1.seconds()); + assert_eq!(first.content, "caractères accentués"); + + assert!(items.next().is_none()); + } +}