awstranscriber: fix what we send over for translations

Prior to this commit, we were sending over words concatenated together
with no separators, for instance "Idon'twanttobeanemperor".

The translation service seems clever enough to translate the contents
anyway, but there is no reason to make its task harder than necessary.
It also didn't re-add separators when the target language was the same
as the source language, which resulted in less-than-ideal output.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/merge_requests/1171>
This commit is contained in:
Mathieu Duponchelle 2023-04-10 16:34:49 +02:00 committed by GStreamer Marge Bot
parent 408fd2030c
commit f366c20869

View file

@ -109,23 +109,34 @@ impl TranslateLoop {
continue;
}
let (ts_duration_list, content): (Vec<(gst::ClockTime, gst::ClockTime)>, String) =
transcript_items
.iter()
.map(|item| {
(
(item.pts, item.duration),
match self.tokenization_method {
Tokenization::None => item.content.clone(),
Tokenization::SpanBased => {
format!("{SPAN_START}{}{SPAN_END}", item.content)
}
},
)
})
.unzip();
let mut ts_duration_list: Vec<(gst::ClockTime, gst::ClockTime)> = vec![];
let mut content: Vec<String> = vec![];
gst::trace!(CAT, imp: self.pad, "Translating {content} with {ts_duration_list:?}");
let mut it = transcript_items.iter().peekable();
while let Some(item) = it.next() {
let suffix = match it.peek() {
Some(next_item) => {
if next_item.is_punctuation {
""
} else {
" "
}
}
None => "",
};
ts_duration_list.push((item.pts, item.duration));
content.push(match self.tokenization_method {
Tokenization::None => format!("{}{}", item.content, suffix),
Tokenization::SpanBased => {
format!("{SPAN_START}{}{SPAN_END}{}", item.content, suffix)
}
});
}
let content: String = content.join("");
gst::debug!(CAT, imp: self.pad, "Translating {content} with {ts_duration_list:?}");
let translated_text = self
.client
@ -143,7 +154,7 @@ impl TranslateLoop {
.translated_text
.unwrap_or_default();
gst::trace!(CAT, imp: self.pad, "Got translation {translated_text}");
gst::debug!(CAT, imp: self.pad, "Got translation {translated_text}");
let translated_items = match self.tokenization_method {
Tokenization::None => {