# Requirements (install before running this app):
#   pip install gradio transformers torchaudio pydub
# pydub (and decoding raw audio bytes in transformers) also needs ffmpeg on the system path.

import gradio as gr
import torch
from transformers import pipeline
import torchaudio
from pydub import AudioSegment
from io import BytesIO
# Load Whisper model for speech-to-text
whisper_model = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Mock Groq LLM interaction (replace with a real Groq LLM API integration)
def groq_llm_interaction(prompt):
    # Placeholder: echoes the input back; see the sketch below for a real call
    return f"Echoing your input: {prompt}"
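
# A hedged sketch of a real integration, assuming the official `groq` SDK and
# its OpenAI-style chat-completions interface (the model id is illustrative;
# check Groq's current model list before use):
#
#     import os
#     from groq import Groq
#
#     groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
#
#     def groq_llm_interaction(prompt):
#         response = groq_client.chat.completions.create(
#             model="llama-3.1-8b-instant",  # assumed model id
#             messages=[{"role": "user", "content": prompt}],
#         )
#         return response.choices[0].message.content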

# Text-to-speech conversion.
# Note: "facebook/fastspeech2-en-ljspeech" is a fairseq checkpoint that the
# transformers pipeline cannot load; "facebook/mms-tts-eng" (a VITS model the
# text-to-speech pipeline does support) is assumed here instead. The pipeline
# is loaded once at import time rather than on every call.
tts_model = pipeline("text-to-speech", model="facebook/mms-tts-eng")

def text_to_speech(text, output_path="output.wav"):
    output = tts_model(text)  # {"audio": np.ndarray, "sampling_rate": int}
    waveform = torch.from_numpy(output["audio"]).reshape(1, -1)  # (channels, frames)
    torchaudio.save(output_path, waveform, output["sampling_rate"])
    return output_path

# Normalize the uploaded audio to WAV bytes (the ASR pipeline accepts raw
# audio-file bytes and decodes them with ffmpeg)
def process_audio(audio):
    audio = AudioSegment.from_file(audio)
    buf = BytesIO()
    audio.export(buf, format="wav")
    return buf.getvalue()
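
# Whisper expects 16 kHz mono input; the pipeline resamples internally when it
# decodes the bytes, but downsampling up front (a pydub one-liner, shown as an
# optional assumption rather than a requirement) keeps the payload small:
#
#     audio = audio.set_frame_rate(16000).set_channels(1)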

# Chatbot function: speech -> text -> LLM response -> speech
def voice_to_voice_chat(audio_file):
    # Normalize the uploaded audio to WAV bytes
    processed_audio = process_audio(audio_file)
    # Step 1: Speech-to-text (Whisper)
    transcript = whisper_model(processed_audio)["text"]
    # Step 2: LLM interaction
    response_text = groq_llm_interaction(transcript)
    # Step 3: Text-to-speech (TTS)
    response_audio_path = text_to_speech(response_text)
    # Return both outputs the interface expects: text and an audio file path
    return f"You said: {transcript}\nResponse: {response_text}", response_audio_path

interface = gr.Interface(
    fn=voice_to_voice_chat,
    inputs=gr.Audio(type="filepath"),  # microphone or uploaded file, passed as a path
    outputs=[
        "text",  # Transcription and response text
        gr.Audio(type="filepath"),  # Generated audio response as a file path
    ],
    live=True,  # Re-run on input change for near-real-time interaction
)

# Launch Gradio app
interface.launch()
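
# On Hugging Face Spaces, launch() with no arguments is sufficient; when running
# locally, interface.launch(share=True) prints a temporary public URL.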