google-cloud-playground/src/main.rs

use bytes::BytesMut;
use google_api_proto::google::cloud::speech::v1::streaming_recognize_request::StreamingRequest;
use google_api_proto::google::cloud::speech::v1::{
    recognition_config::AudioEncoding, speech_client::SpeechClient, RecognitionConfig,
    StreamingRecognitionConfig, StreamingRecognizeRequest,
};
use google_authz::{Credentials, GoogleAuthz};
use log::{debug, info};
use tokio::io::AsyncReadExt;
use tonic::transport::Channel;
use tracing::Instrument;

/// Streams a local FLAC file to the Google Cloud Speech-to-Text
/// `StreamingRecognize` RPC and logs every transcript together with its
/// per-word time offsets.
///
/// The audio is read by a background task and handed to the gRPC request
/// stream through an unbounded channel, so the upload runs concurrently with
/// the consumption of recognition results.
///
/// # Errors
///
/// Returns an error if the channel or credentials cannot be set up, if the
/// RPC fails, if the audio file cannot be opened or read, or if the audio
/// task panics.
#[tokio::main]
async fn main() -> eyre::Result<()> {
    // Audio source and pacing of the upload.
    const AUDIO_PATH: &str = "some-audio.flac";
    const CHUNK_BYTES: usize = 1024 * 50;
    const CHUNK_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);

    tracing_subscriber::fmt::init();

    debug!("starting...");

    let channel = Channel::from_static("https://speech.googleapis.com")
        .connect()
        .await?;

    let credentials = Credentials::builder()
        .json_file("i-centralvideo-dictate-dev-c184dd68967a.json".as_ref())
        .build()
        .await?;
    let channel = GoogleAuthz::builder(channel)
        .credentials(credentials)
        .build()
        .await;

    debug!("authenticated channel created!");

    let mut client = SpeechClient::new(channel);

    let (sender, mut receiver) = tokio::sync::mpsc::unbounded_channel();

    // The first message on the stream must carry the configuration; audio
    // content follows in the subsequent messages.
    sender.send(StreamingRecognizeRequest {
        streaming_request: Some(StreamingRequest::StreamingConfig(
            StreamingRecognitionConfig {
                config: Some(RecognitionConfig {
                    encoding: AudioEncoding::Flac.into(), // matching current example file
                    sample_rate_hertz: 44_100,            // matching current example file
                    audio_channel_count: 2,
                    language_code: "en-US".to_string(),   // we only support en-US to start with
                    model: "video".to_string(),           // dictate does not set this option
                    use_enhanced: true,                   // dictate does not set this option
                    profanity_filter: true,               // used by Dictate, so we also use it here
                    enable_word_time_offsets: true, // important so we can get the spoken word time ranges
                    max_alternatives: 1,            // make sure the default is used
                    ..Default::default()
                }),
                single_utterance: false,
                interim_results: false,
            },
        )),
    })?;

    // Feed the audio from a background task. The span is attached to the task
    // body itself (not to the join handle) so it covers the actual work, and
    // the handle is only awaited after the RPC has been driven to completion.
    let audio_task = tokio::spawn(
        async move {
            let file = tokio::fs::File::open(AUDIO_PATH).await?;
            let mut audio_file = tokio::io::BufReader::new(file);
            let mut buffer = vec![0u8; CHUNK_BYTES];
            loop {
                let n = audio_file.read(&mut buffer[..]).await?;
                if n == 0 {
                    // `read` returning 0 bytes signals end of file. Leaving the
                    // loop drops `sender`, which closes the request stream.
                    break;
                }
                let request = StreamingRecognizeRequest {
                    streaming_request: Some(StreamingRequest::AudioContent(
                        BytesMut::from(&buffer[..n]).freeze(),
                    )),
                };
                if sender.send(request).is_err() {
                    // The receiving side is gone (the RPC ended), so there is
                    // nobody left to read the audio.
                    break;
                }
                tokio::time::sleep(CHUNK_INTERVAL).await;
            }
            Ok::<(), std::io::Error>(())
        }
        .instrument(tracing::info_span!("audio-source")),
    );

    // Adapt the channel receiver into the request stream expected by tonic.
    let message = async_stream::stream! {
        while let Some(message) = receiver.recv().await {
            debug!("drained message inside stream...");
            yield message;
        }
    };

    let response = client
        .streaming_recognize(tonic::Request::new(message))
        .await?;
    let mut inbound = response.into_inner();

    while let Some(response) = inbound
        .message()
        .instrument(tracing::info_span!("transcription-results"))
        .await?
    {
        for (index, res) in response.results.iter().enumerate() {
            info!("Result {} {{", index + 1);
            if let Some(rec) = res.alternatives.first() {
                info!("\tTranscription: {}", rec.transcript);
                for word_info in &rec.words {
                    match (word_info.start_time.as_ref(), word_info.end_time.as_ref()) {
                        // Nanoseconds are zero-padded to nine digits so that
                        // e.g. 1s + 50ms prints as 1.050000000 and not 1.50000000.
                        (Some(start_time), Some(end_time)) => info!(
                            "\t - {}: [{}.{:09} - {}.{:09}]",
                            word_info.word,
                            start_time.seconds,
                            start_time.nanos,
                            end_time.seconds,
                            end_time.nanos
                        ),
                        // Offsets are optional in the response; do not panic
                        // when one is absent.
                        _ => info!("\t - {}: [no time offsets]", word_info.word),
                    }
                }
            }
            info!("}}");
        }
    }

    // Surface a panic (outer result) or an I/O failure (inner result) from
    // the audio task.
    audio_task.await??;

    Ok(())
}