// google-cloud-playground/src/main.rs

use bytes::BytesMut;
use google_api_proto::google::cloud::speech::v1::streaming_recognize_request::StreamingRequest;
use google_api_proto::google::cloud::speech::v1::{
    recognition_config::AudioEncoding, speech_client::SpeechClient, RecognitionConfig,
    StreamingRecognitionConfig, StreamingRecognizeRequest,
};
use google_authz::{Credentials, GoogleAuthz};
use log::{debug, info};
use tokio::io::AsyncReadExt;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tonic::transport::Channel;
use tracing::Instrument;

/// Streams a local FLAC file to the Google Cloud Speech-to-Text streaming API
/// and logs every final transcription result together with its per-word
/// time ranges.
///
/// The flow is:
/// 1. open a TLS channel to `speech.googleapis.com` and wrap it with
///    credentials loaded from a JSON file,
/// 2. queue the recognition config as the first streaming request,
/// 3. spawn a task that feeds the audio file to the request stream in chunks,
/// 4. read responses until the server closes the response stream.
///
/// # Errors
///
/// Returns an error if the channel cannot be connected, the credentials or
/// the audio file cannot be loaded, the RPC fails, or reading the audio file
/// fails part-way through the upload.
#[tokio::main]
async fn main() -> eyre::Result<()> {
    /// Number of audio bytes sent per streaming request.
    const AUDIO_CHUNK_SIZE: usize = 1024 * 50;
    /// Pause between two consecutive audio chunks.
    const CHUNK_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);

    tracing_subscriber::fmt::init();
    // console_subscriber::init();

    debug!("starting...");

    let speech_api_channel = Channel::from_static("https://speech.googleapis.com")
        .connect()
        .await?;

    let credentials = Credentials::builder()
        .json_file("i-centralvideo-dictate-dev-c184dd68967a.json".as_ref())
        .build()
        .await?;
    let auth_channel = GoogleAuthz::builder(speech_api_channel)
        .credentials(credentials)
        .build()
        .await;

    debug!("authenticated channel created!");

    let mut client = SpeechClient::new(auth_channel);

    let (sender, receiver) = tokio::sync::mpsc::unbounded_channel();

    // The configuration is queued before any audio so that it is the first
    // message on the request stream.
    sender.send(StreamingRecognizeRequest {
        streaming_request: Some(StreamingRequest::StreamingConfig(
            StreamingRecognitionConfig {
                config: Some(RecognitionConfig {
                    encoding: AudioEncoding::Flac.into(), // matching current example file
                    sample_rate_hertz: 44_100,            // matching current example file
                    audio_channel_count: 2,
                    language_code: "en-US".to_string(),   // we only support en-US to start with
                    model: "video".to_string(),           // dictate does not set this option
                    use_enhanced: true,                   // dictate does not set this option
                    profanity_filter: true,               // used by Dictate, so we also use it here
                    enable_word_time_offsets: true, // important so we can get the spoken word time ranges
                    max_alternatives: 1,            // make sure the default is used
                    ..Default::default()
                }),
                single_utterance: false,
                interim_results: false,
            },
        )),
    })?;

    // Open the file up front so a missing file is reported as an error from
    // `main` instead of a panic inside the background task.
    let audio_file = tokio::fs::File::open("some-audio.flac").await?;

    let upload_task = tokio::spawn(async move {
        let mut audio_file = tokio::io::BufReader::new(audio_file);
        let mut buffer = [0u8; AUDIO_CHUNK_SIZE];
        loop {
            let n = audio_file.read(&mut buffer[..]).await?;
            if n == 0 {
                // `read` keeps returning `Ok(0)` at end of file; stop here so
                // `sender` is dropped and the request stream is closed.
                break;
            }
            let request = StreamingRecognizeRequest {
                streaming_request: Some(StreamingRequest::AudioContent(
                    BytesMut::from(&buffer[..n]).freeze(),
                )),
            };
            if sender.send(request).is_err() {
                // The receiving half is gone, so nobody consumes the audio
                // any more; there is no point in reading further.
                break;
            }
            // Pace the upload instead of queueing the whole file at once.
            tokio::time::sleep(CHUNK_INTERVAL).await;
        }
        Ok::<(), std::io::Error>(())
    });

    let response = client
        .streaming_recognize(UnboundedReceiverStream::new(receiver))
        .await?;
    let mut inbound = response.into_inner();

    while let Some(response) = inbound
        .message()
        .instrument(tracing::info_span!("transcription-results"))
        .await?
    {
        // Only final results are reported; numbering restarts at 1 for every
        // response message.
        let final_results = response.results.iter().filter(|res| res.is_final);
        for (index, res) in final_results.enumerate() {
            info!("Result {} {{", index + 1);

            if let Some(rec) = res.alternatives.first() {
                info!("\tTranscription: {}", rec.transcript);
                for word_info in &rec.words {
                    match (word_info.start_time.as_ref(), word_info.end_time.as_ref()) {
                        (Some(start_time), Some(end_time)) => info!(
                            // Nanoseconds are zero-padded to nine digits so the
                            // value reads as a correct decimal fraction.
                            "\t - {}: [{}.{:09} - {}.{:09}]",
                            word_info.word,
                            start_time.seconds,
                            start_time.nanos,
                            end_time.seconds,
                            end_time.nanos
                        ),
                        // Do not panic on a word that carries no time offsets.
                        _ => info!("\t - {}: [no timing information]", word_info.word),
                    }
                }
            }
            info!("}}");
        }
    }

    // Surface a panic (outer `?`) or a read error (inner `?`) from the upload.
    upload_task.await??;

    Ok(())
}