Spaces:

S0h9l
/

Coherent_Speech

Runtime error

File size: 3,570 Bytes

f345c14
fb478e7
 
c2112a9
f345c14
653a5f3
fb478e7
 
 
 
953d4fd
fb478e7
f345c14
 
8d2c29a
 
 
fb478e7
 
 
 
 
 
 
 
 
 
 
 
 
ebb66c9
fb478e7
 
 
 
 
4b0025d
953d4fd
fb478e7
 
 
 
 
 
 
 
 
c2112a9
 
 
f345c14
c2112a9
c6cb9fd
c2112a9
fb478e7
 
 
 
3946654
ddfb052
fb478e7
 
3109e9d
f345c14
 
fb478e7
2c8b634
0d1b1a4

import gradio as gr
import whisper
import cohere
from deep_translator import GoogleTranslator
from gtts import gTTS
import gtts.langs
#from dotenv import load_dotenv

#load_dotenv()

model = whisper.load_model("base")

LANGUAGES = list(gtts.lang.tts_langs())

def transcribe(api,audio,language):
    co = cohere.Client(api) 

    #time.sleep(3)
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # decode the audio
    options = whisper.DecodingOptions(fp16 = False)
    result = whisper.decode(model, mel, options)

    #cohere
    response = co.generate(
    model='xlarge',
    prompt=f'This program will generate an introductory paragraph to a blog post given a blog title, audience, and tone of voice.\n--\nBlog Title: Best Activities in Toronto\nAudience: Millennials\nTone of Voice: Lighthearted\nFirst Paragraph: Looking for fun things to do in Toronto? When it comes to exploring Canada\'s largest city, there\'s an ever-evolving set of activities to choose from. Whether you\'re looking to visit a local museum or sample the city\'s varied cuisine, there is plenty to fill any itinerary. In this blog post, I\'ll share some of my favorite recommendations\n--\nBlog Title: Mastering Dynamic Programming\nAudience: Developers\nTone: Informative\nFirst Paragraph: In this piece, we\'ll help you understand the fundamentals of dynamic programming, and when to apply this optimization technique. We\'ll break down bottom-up and top-down approaches to solve dynamic programming problems.\n--\nBlog Title: How to Get Started with Rock Climbing\nAudience: Athletes\nTone: Enthusiastic\nFirst Paragraph:If you\'re an athlete who\'s looking to learn how to rock climb, then you\'ve come to the right place! This blog post will give you all the information you need to know about how to get started in the sport. Rock climbing is a great way to stay active and challenge yourself in a new way. It\'s also a great way to make new friends and explore new places. So, what are you waiting for? Get out there and start climbing!\n--\nBlog Title: {result.text}\nAudience: Engineers\nTone: Enthusiastic\nFirst Paragraph:',
    max_tokens=200,
    temperature=0.8,
    k=0,
    p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop_sequences=["--"],
    return_likelihoods='NONE')
    #result.text
    reptxt = response.generations[0].text.strip("--")

    #Google models
    translated = GoogleTranslator(source='auto', target=language).translate(reptxt)
    filename = 'result.mp3'
    tts = gTTS(text=translated, lang=language)
    tts.save(filename)
    return filename, translated



gr.Interface(
    title = 'Coherent Speech', 
    description = 'Enter the API key, then start recording give your input, stop recording, select language;language can also be selected after the output. Do not worry about error message in the output section',
    fn=transcribe, 
    inputs=[
        gr.inputs.Textbox(lines=1, label="Enter your Cohere API Key"),
        gr.inputs.Audio(source="microphone", type="filepath"), 
            gr.Radio(label="Language", choices=LANGUAGES, value="en")
    ],
    outputs=[gr.Audio(label="Output",type="filepath"),gr.outputs.Textbox(label="Generated Text")],
    live=True).launch()