Spaces:

awacke1
/

SpeechStoryReadAloud

Sleeping

File size: 5,033 Bytes

import streamlit as st
import firebase_admin
import datetime
import gradio as gr
import numpy as np
import tempfile

from firebase_admin import credentials
from firebase_admin import firestore
from transformers import pipeline
from typing import Optional
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
from gradio import inputs
from gradio.inputs import Textbox
from gradio import outputs

#Persistence via Cloud Store
@st.experimental_singleton
def get_db_firestore():
    """Initialise the Firebase app and return a Firestore client.

    The service-account certificate is read from the local ``test.json``
    file and the app is bound to the ``clinical-nlp-b9117`` project.
    The Streamlit singleton decorator wraps this function, presumably so
    the Firebase app is initialised only once per process.
    """
    certificate = credentials.Certificate('test.json')
    app_options = {'projectId': u'clinical-nlp-b9117'}
    firebase_admin.initialize_app(certificate, app_options)
    return firestore.client()
# Module-level Firestore client shared by upsert(), select() and selectall().
db = get_db_firestore()
# Speech-recognition pipeline used by transcribe() and speech_to_text().
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")

# TTS models: identifiers are resolved under the "tts_models/" prefix when
# downloaded below.
MODEL_NAMES = [
    "en/ljspeech/tacotron2-DDC",
    "en/ljspeech/glow-tts",
    "en/ljspeech/speedy-speech-wn",
    "en/ljspeech/vits",
    # Disabled alternatives:
    #"en/sam/tacotron-DDC",
    #"fr/mai/tacotron2-DDC",
    #"de/thorsten/tacotron2-DCA",
]
# Maps each entry of MODEL_NAMES to the Synthesizer built for it.
MODELS = {}
manager = ModelManager()
for MODEL_NAME in MODEL_NAMES:
    print(f"downloading {MODEL_NAME}")
    model_path, config_path, model_item = manager.download_model(f"tts_models/{MODEL_NAME}")
    vocoder_name: Optional[str] = model_item["default_vocoder"]
    if vocoder_name is None:
        # The model entry lists no default vocoder, so none is downloaded.
        vocoder_path, vocoder_config_path = None, None
    else:
        vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)

    synthesizer = Synthesizer(
        model_path,
        config_path,
        None,
        vocoder_path,
        vocoder_config_path,
    )
    MODELS[MODEL_NAME] = synthesizer

# Hosted model identifiers, presumably for text generation (verify intent).
# Nothing else in the visible part of this file references GEN_NAMES.
GEN_NAMES = [
    "huggingface/EleutherAI/gpt-neo-2.7B",
    "huggingface/EleutherAI/gpt-j-6B",
    "huggingface/gpt2-large"
]


#ASR
def transcribe(audio):
    """Run the module-level ``asr`` pipeline on *audio* and return its ``"text"`` field."""
    return asr(audio)["text"]

# Sentiment classifier used by text_to_sentiment(). No model name is passed,
# so the pipeline falls back to the library's default text-classification model.
classifier = pipeline("text-classification")

# Story generation pipeline. The checkpoint loaded here is
# "pranavpsv/gpt2-genre-story-generator" (GPT-2 by its name, not GPT-J).
# NOTE(review): story_gen is not referenced anywhere else in the visible file.
story_gen = pipeline("text-generation", "pranavpsv/gpt2-genre-story-generator")


#STT
def speech_to_text(speech):
    """Transcribe *speech* with the ``asr`` pipeline and return the recognised text.

    The UI wires this to an audio input configured with ``type="filepath"``,
    so *speech* arrives as a path to the recorded file.
    """
    recognition = asr(speech)
    return recognition["text"]
    
#TTSentiment
def text_to_sentiment(text):
    """Classify *text* and return the ``"label"`` of the first prediction."""
    first_prediction = classifier(text)[0]
    return first_prediction["label"]

#Save
def upsert(text):
    """Save *text* to Firestore and return the stored document read back.

    A new document keyed by the current timestamp string is written to the
    ``Text2SpeechSentimentSave`` collection and then fetched again through
    ``select`` so the caller can display what was persisted.

    Args:
        text: The text to store; it is saved under the ``last`` field.

    Returns:
        Whatever ``select`` returns for the newly written document.
    """
    collection = 'Text2SpeechSentimentSave'
    date_time = str(datetime.datetime.today())
    doc_ref = db.collection(collection).document(date_time)
    doc_ref.set({u'firefield': 'Recognize Speech', u'first': 'https://huggingface.co/spaces/awacke1/TTS-STT-Blocks/', u'last': text, u'born': date_time,})
    # Read back from the same collection that was just written. The earlier
    # code queried 'TTS-STT', a different collection from the one written to,
    # so the freshly saved document could not be found there.
    saved = select(collection, date_time)
    # check it here:  https://console.firebase.google.com/u/0/project/clinical-nlp-b9117/firestore/data/~2FStreamlitSpaces
    return saved
      
#OpenLast
def select(collection, document):
    """Fetch one Firestore document and return its contents with a label.

    Args:
        collection: Name of the Firestore collection to read from.
        document: Id of the document inside that collection.

    Returns:
        A 2-tuple ``("The contents are: ", data)`` where ``data`` is the
        result of the snapshot's ``to_dict()`` (per the Firestore client
        documentation this is ``None`` when the document does not exist).
    """
    # The former unused "docid" tuple was dropped; only the contents are returned.
    snapshot = db.collection(collection).document(document).get()
    return ("The contents are: ", snapshot.to_dict())

#OpenAll   
def selectall(text):
    """Return every document in ``Text2SpeechSentimentSave`` as one string.

    Each document is rendered as ``"<id> => <fields dict>"`` and the pieces
    are concatenated with no separator between them. *text* is accepted so
    the function can be wired to a click handler with an input, but it is
    not used.
    """
    snapshots = db.collection('Text2SpeechSentimentSave').stream()
    return ''.join(f'{snap.id} => {snap.to_dict()}' for snap in snapshots)

#TTS
def tts(text: str, model_name: str):
    """Synthesise *text* with the chosen model and return a WAV file path.

    Args:
        text: The text to read aloud.
        model_name: Key into ``MODELS`` selecting the synthesizer to use.

    Returns:
        The filesystem path of a temporary ``.wav`` file holding the audio.

    Raises:
        NameError: If *model_name* has no synthesizer in ``MODELS``.
    """
    print(text, model_name)
    engine = MODELS.get(model_name)
    if engine is None:
        raise NameError("model not found")
    waveform = engine.tts(text)
    # delete=False keeps the file on disk after the handle is closed, so the
    # returned path remains usable by the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        engine.save_wav(waveform, wav_file)
    return wav_file.name
        

# Gradio Blocks UI: record speech, transcribe it, classify sentiment,
# save/retrieve the text in Firestore, and read the text back aloud.
demo = gr.Blocks()
with demo:
    #UI
    # Microphone recording; handlers receive it as a file path.
    # NOTE(review): gr.inputs.* is the legacy input namespace; confirm against
    # the installed Gradio version whether gr.Audio / gr.Radio should be used.
    audio_file = gr.inputs.Audio(source="microphone", type="filepath")
    # Holds the transcription; also the input for sentiment, save and TTS.
    text = gr.Textbox()
    # Shows the sentiment label returned by text_to_sentiment().
    label = gr.Label()
    # Shows the result returned by upsert() after saving.
    saved = gr.Textbox()
    # Shows the string returned by selectall().
    savedAll = gr.Textbox()
    # Choices are the keys of MODELS, so the selected value can be passed to tts().
    TTSchoice = gr.inputs.Radio( label="Pick a TTS Model", choices=MODEL_NAMES,   )
    # Plays the WAV file whose path tts() returns.
    audio = gr.Audio(label="Output", interactive=False)


    #Buttons
    b1 = gr.Button("Recognize Speech")
    b2 = gr.Button("Classify Sentiment")
    b3 = gr.Button("Save Speech to Text")
    b4 = gr.Button("Retrieve All")
    b5 = gr.Button("Read It Back Aloud")

    #Event Model Chains
    # Each button runs one handler, reading from and writing to the components above.
    b1.click(speech_to_text, inputs=audio_file, outputs=text)
    b2.click(text_to_sentiment, inputs=text, outputs=label)
    b3.click(upsert, inputs=text, outputs=saved)
    # selectall() ignores its input; the textbox is passed only to satisfy the wiring.
    b4.click(selectall, inputs=text, outputs=savedAll)
    b5.click(tts,  inputs=[text,TTSchoice], outputs=audio)

# Start the app; share=True requests a public share link from Gradio.
demo.launch(share=True)

# Title and example prompts, apparently for a story-generator interface.
# Nothing in the visible file passes these to a UI component, and they are
# assigned only after demo.launch() has been called.
# NOTE(review): the second and fourth examples are identical; confirm
# whether the duplicate is intended.
title = "Story Generators"
examples = [
    ["At which point do we invent Love?"],
    ["Love is a capacity more than consciousness is universal."],
    ["See the grace of god in eachother."],
    ["Love is a capacity more than consciousness is universal."],
    ["Love is generativity when there is more energy than what they need for equilibrium."],
    ["Collections of people have agency and mass having agency at the mesoscopic level"],
    ["Having a deep human connection is an interface problem to solve."],
    ["Having a collective creates agency since we build trust in eachother."]
]