diff --git a/Cargo.toml b/Cargo.toml index b11b925..81541b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ build = "build.rs" [dependencies] gst = { package = "gstreamer", version = "0.18" } -gstreamer-base = "0.18" +gst-base = { package = "gstreamer-base", version = "0.18" } once_cell = "1" atomic_refcell = "0.1" serde = "1" @@ -21,7 +21,7 @@ tokio = { version = "1.0", features = [ "rt-multi-thread", "time" ] } async-tungstenite = { version = "0.17", features = ["tokio", "tokio-runtime", "tokio-native-tls"] } [build-dependencies] -gst-plugin-version-helper = "0.7.3" +gst-plugin-version-helper = "0.7" [lib] name = "gstvosk" diff --git a/README.md b/README.md index de88dbc..5a85fdf 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,48 @@ -Vosk Speech Recognition GStreamer Plugin -======================================== +# Vosk Speech Recognition GStreamer Plugin Transcription of speech using [Vosk Toolkit](https://alphacephei.com/vosk/). Can be used to generate subtitles for -videos, transcription of audio notes, etc. +movies, live streams, lectures and interviews. -Usage ------ +> Vosk is an offline open source speech recognition toolkit. It enables speech recognition for 20+ languages and +> dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese, +> Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, Ukrainian, Kazakh, Swedish, Japanese, Esperanto, Hindi. +> More to come. +> +> https://github.com/alphacep/vosk-api + +This GStreamer plugin was inspired by the work of [@MathieuDuponchelle](https://github.com/mathieuduponchelle) in the +[AwsTranscriber](https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/tree/main/net/rusoto#awstranscriber) element. + +## Build + +Compiling this project will provide a shared library that can be used by your local GStreamer installation. ```bash -GST_DEBUG=1,vosk_transcriber:5 gst-launch-1.0 filesrc location=/Users/rafaelcaricio/astronaut.mkv ! 
matroskademux name=d d.audio_0 ! decodebin ! audiorate ! audioconvert ! audioresample ! audio/x-raw,format=S16LE,rate=48000,channels=1 ! vosk_transcriber server-address=ws://192.168.178.20:2700 ! fakesink dump=true --gst-plugin-path=/Users/rafaelcaricio/development/gst-plugin-vosk/target/release/ -``` \ No newline at end of file +cargo build --release +``` + +The compiled shared library `./target/release/libgstvosk.dylib` must be made loadable to GStreamer. One possible +solution is to use the argument `--gst-plugin-path=` pointing to the location where the library file is every time you +run the `gst-launch-1.0` command line tool. + + +## Example Usage + +This plugin connects via the WebSocket protocol to the [Vosk Server](https://alphacephei.com/vosk/server). The easiest +way to run the Vosk server is using [Docker](https://docs.docker.com/). You can run the server locally using +this command: + +```bash +docker run --rm --name vosk-server -d -p 2700:2700 alphacep/kaldi-en:latest +``` + +Running the recognition server as a separate process comes with the additional benefit that you don't need to +install any special software. Plus the voice recognition workload is off your GStreamer pipeline process. + +This example will just print out the raw text buffers that are published out by the Vosk transcriber: + +```bash +gst-launch-1.0 \ + vosk_transcriber name=tc ! fakesink sync=true dump=true \ + uridecodebin uri=https://studio.blender.org/download-source/d1/d1f3b354a8f741c6afabf305489fa510/d1f3b354a8f741c6afabf305489fa510-1080p.mp4 ! audioconvert ! tc. 
+``` diff --git a/src/transcriber/imp.rs b/src/transcriber/imp.rs index 075fde0..2668bc5 100644 --- a/src/transcriber/imp.rs +++ b/src/transcriber/imp.rs @@ -46,6 +46,8 @@ static RUNTIME: Lazy = Lazy::new(|| { const DEFAULT_LATENCY: gst::ClockTime = gst::ClockTime::from_seconds(30); const DEFAULT_SERVER_ADDRESS: &str = "ws://localhost:2700"; +const DEFAULT_MIN_CONFIDENCE_THRESHOLD: f64 = 0.7; + const GRANULARITY: gst::ClockTime = gst::ClockTime::from_mseconds(100); #[derive(Debug, Clone)] @@ -55,6 +57,9 @@ struct Settings { /// The address of the gRPC server to connect to for transcription. server_address: String, + + /// Transcription confidence threshold. Anything below this will be ignored. + min_confidence_threshold: f64, } impl Default for Settings { @@ -62,6 +67,7 @@ impl Default for Settings { Settings { latency: DEFAULT_LATENCY, server_address: DEFAULT_SERVER_ADDRESS.to_string(), + min_confidence_threshold: DEFAULT_MIN_CONFIDENCE_THRESHOLD, } } } @@ -291,7 +297,13 @@ impl Transcriber { state: &mut State, transcription: &Vec, ) { + let min_confidence_threshold = self.settings.lock().unwrap().min_confidence_threshold; for item in transcription.iter() { + // Skip items with a confidence below the threshold + if item.confidence < min_confidence_threshold { + continue; + } + let start_time = gst::ClockTime::from_nseconds((item.start * 1_000_000_000.0) as u64); let end_time = gst::ClockTime::from_nseconds((item.end * 1_000_000_000.0) as u64); @@ -950,6 +962,15 @@ impl ObjectImpl for Transcriber { Some(DEFAULT_SERVER_ADDRESS), glib::ParamFlags::READWRITE | gst::PARAM_FLAG_MUTABLE_READY, ), + glib::ParamSpecDouble::new( + "min-confidence", + "Minimum Confidence", + "Transcription minimum confidence threshold. 
Anything below this will be ignored.", + 0.0, + 1.0, + DEFAULT_MIN_CONFIDENCE_THRESHOLD, + glib::ParamFlags::READWRITE | gst::PARAM_FLAG_MUTABLE_READY, + ), ] }); @@ -982,6 +1003,10 @@ impl ObjectImpl for Transcriber { let mut settings = self.settings.lock().unwrap(); settings.server_address = value.get().expect("type checked upstream") } + "min-confidence" => { + let mut settings = self.settings.lock().unwrap(); + settings.min_confidence_threshold = value.get().expect("type checked upstream") + } _ => unimplemented!(), } } @@ -996,6 +1021,10 @@ impl ObjectImpl for Transcriber { let settings = self.settings.lock().unwrap(); settings.server_address.to_value() } + "min-confidence" => { + let settings = self.settings.lock().unwrap(); + settings.min_confidence_threshold.to_value() + } _ => unimplemented!(), } }