From f366c2086996b8afb906120d35c92c4b2f6af40d Mon Sep 17 00:00:00 2001 From: Mathieu Duponchelle Date: Mon, 10 Apr 2023 16:34:49 +0200 Subject: [PATCH] awstranscriber: fix what we send over for translations Prior to this commit, we were sending over words concatenated together with no separators, for instance "Idon'twanttobeanemperor". The translation service seems clever enough to translate the contents anyway, but there is no reason to make its task harder than necessary, and it didn't re-add separators when the target language was the same as the source language, which resulted in less than ideal output. Part-of: --- net/aws/src/transcriber/translate.rs | 45 +++++++++++++++++----------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/net/aws/src/transcriber/translate.rs b/net/aws/src/transcriber/translate.rs index 71f43aef..8fca532b 100644 --- a/net/aws/src/transcriber/translate.rs +++ b/net/aws/src/transcriber/translate.rs @@ -109,23 +109,34 @@ impl TranslateLoop { continue; } - let (ts_duration_list, content): (Vec<(gst::ClockTime, gst::ClockTime)>, String) = - transcript_items - .iter() - .map(|item| { - ( - (item.pts, item.duration), - match self.tokenization_method { - Tokenization::None => item.content.clone(), - Tokenization::SpanBased => { - format!("{SPAN_START}{}{SPAN_END}", item.content) - } - }, - ) - }) - .unzip(); + let mut ts_duration_list: Vec<(gst::ClockTime, gst::ClockTime)> = vec![]; + let mut content: Vec<String> = vec![]; - gst::trace!(CAT, imp: self.pad, "Translating {content} with {ts_duration_list:?}"); + let mut it = transcript_items.iter().peekable(); + + while let Some(item) = it.next() { + let suffix = match it.peek() { + Some(next_item) => { + if next_item.is_punctuation { + "" + } else { + " " + } + } + None => "", + }; + ts_duration_list.push((item.pts, item.duration)); + content.push(match self.tokenization_method { + Tokenization::None => format!("{}{}", item.content, suffix), + Tokenization::SpanBased => { + 
format!("{SPAN_START}{}{SPAN_END}{}", item.content, suffix) + } + }); + } + + let content: String = content.join(""); + + gst::debug!(CAT, imp: self.pad, "Translating {content} with {ts_duration_list:?}"); let translated_text = self .client @@ -143,7 +154,7 @@ impl TranslateLoop { .translated_text .unwrap_or_default(); - gst::trace!(CAT, imp: self.pad, "Got translation {translated_text}"); + gst::debug!(CAT, imp: self.pad, "Got translation {translated_text}"); let translated_items = match self.tokenization_method { Tokenization::None => {