use bytes::BytesMut;
use google_api_proto::google::cloud::speech::v1::streaming_recognize_request::StreamingRequest;
use google_api_proto::google::cloud::speech::v1::{
    recognition_config::AudioEncoding, speech_client::SpeechClient, RecognitionConfig,
    StreamingRecognitionConfig, StreamingRecognizeRequest,
};
use google_authz::{Credentials, GoogleAuthz};
use log::{debug, info};
use tokio::io::AsyncReadExt;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tonic::transport::Channel;
use tracing::Instrument;

#[tokio::main]
async fn main() -> eyre::Result<()> {
    tracing_subscriber::fmt::init();
    // console_subscriber::init();
    debug!("starting...");

    // Open an HTTPS channel to the Speech-to-Text API and wrap it so every
    // request is authenticated with the service-account credentials.
    let speech_api_channel = Channel::from_static("https://speech.googleapis.com")
        .connect()
        .await?;
    let credentials = Credentials::builder()
        .json_file("i-centralvideo-dictate-dev-c184dd68967a.json".as_ref())
        .build()
        .await?;
    let auth_channel = GoogleAuthz::builder(speech_api_channel)
        .credentials(credentials)
        .build()
        .await;
    debug!("authenticated channel created!");

    let mut client = SpeechClient::new(auth_channel);

    // The gRPC request stream is fed from this channel. The first message must
    // carry the streaming configuration; every later message carries audio bytes.
    let (sender, receiver) = tokio::sync::mpsc::unbounded_channel();
    sender.send(StreamingRecognizeRequest {
        streaming_request: Some(StreamingRequest::StreamingConfig(
            StreamingRecognitionConfig {
                config: Some(RecognitionConfig {
                    encoding: AudioEncoding::Flac.into(), // matching current example file
                    sample_rate_hertz: 44_100,            // matching current example file
                    audio_channel_count: 2,
                    language_code: "en-US".to_string(), // we only support en-US to start with
                    model: "video".to_string(),         // dictate does not set this option
                    use_enhanced: true,                 // dictate does not set this option
                    profanity_filter: true, // used by Dictate, so we also use it here
                    enable_word_time_offsets: true, // important so we can get the spoken word time ranges
                    max_alternatives: 1,            // make sure the default is used
                    ..Default::default()
                }),
                single_utterance: false,
                interim_results: false,
            },
        )),
    })?;

    // Stream the audio file to the API in 50 KiB chunks, pausing between sends
    // to roughly simulate real-time capture.
    tokio::spawn(async move {
        let file = tokio::fs::File::open("some-audio.flac").await.unwrap();
        let mut audio_file = tokio::io::BufReader::new(file);
        let mut buffer = [0; 1024 * 50];
        while let Ok(n) = audio_file.read(&mut buffer[..]).await {
            if n == 0 {
                // `read` returns Ok(0) at end of file; without this check the
                // loop would send empty audio chunks forever.
                break;
            }
            let request = StreamingRecognizeRequest {
                streaming_request: Some(StreamingRequest::AudioContent(
                    BytesMut::from(&buffer[..n]).freeze(),
                )),
            };
            if sender.send(request).is_err() {
                // the receiving side of the stream has been dropped
                break;
            }
            // debug!("added a buffer to the sender queue: {} bytes", n);
            tokio::time::sleep(std::time::Duration::from_millis(100)).await;
        }
    });

    let response = client
        .streaming_recognize(UnboundedReceiverStream::new(receiver))
        .await?;
    let mut inbound = response.into_inner();

    while let Some(response) = inbound
        .message()
        .instrument(tracing::info_span!("transcription-results"))
        .await?
    {
        let mut num_results = 0;
        for res in &response.results {
            if res.is_final {
                num_results += 1;
                info!("Result {} {{", num_results);
                if let Some(rec) = res.alternatives.first() {
                    info!("\tTranscription: {}", rec.transcript);
                    for word_info in &rec.words {
                        // word timestamps are protobuf Durations (seconds + nanos)
                        let start_time = word_info.start_time.as_ref().unwrap();
                        let end_time = word_info.end_time.as_ref().unwrap();
                        info!(
                            "\t - {}: [{}.{} - {}.{}]",
                            word_info.word,
                            start_time.seconds,
                            start_time.nanos,
                            end_time.seconds,
                            end_time.nanos
                        );
                    }
                }
                info!("}}");
            }
        }
    }

    Ok(())
}
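
// A plausible Cargo.toml dependency set for building this example. The crate
// names follow from the `use` statements above; the version numbers and
// feature flags are assumptions, not taken from the original project:
//
// [dependencies]
// bytes = "1"
// eyre = "0.6"
// google-api-proto = { version = "1", features = ["google-cloud-speech-v1"] }
// google-authz = { version = "1", features = ["tonic"] }
// log = "0.4"
// tokio = { version = "1", features = ["full"] }
// tokio-stream = "0.1"
// tonic = { version = "0.8", features = ["tls", "tls-roots"] }
// tracing = "0.1"
// tracing-subscriber = "0.3"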