Update app.py
app.py
CHANGED
import streamlit as st
import soundfile as sf
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
)
from datasets import load_dataset
from pydub import AudioSegment
from pydub.playback import play

# Model identifiers
asr_model_name = "facebook/wav2vec2-large-robust-ft-swbd-300h"
# NOTE: vanilla T5 is not fine-tuned for "correct:"/"reply:" prompts, so output
# quality will be limited; a grammar-correction checkpoint would fit better here.
grammar_model_name = "google-t5/t5-base"
tts_model_name = "microsoft/speecht5_tts"

# Initialize pipelines and models, cached so Streamlit reruns don't reload them
@st.cache_resource
def load_models():
    asr = pipeline("automatic-speech-recognition", model=asr_model_name)
    tokenizer = AutoTokenizer.from_pretrained(grammar_model_name)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(grammar_model_name)
    processor = SpeechT5Processor.from_pretrained(tts_model_name)
    tts = SpeechT5ForTextToSpeech.from_pretrained(tts_model_name)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    # SpeechT5 requires a speaker embedding; use an x-vector from CMU ARCTIC
    embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)
    return asr, tokenizer, seq2seq, processor, tts, vocoder, speaker_embedding

(asr_pipeline, grammar_tokenizer, grammar_model,
 tts_processor, tts_model, tts_vocoder, speaker_embedding) = load_models()

# Streamlit UI setup
st.title("Voice-to-Voice Chatbot with Grammar Correction")

# Voice input
st.header("Step 1: Record your voice")
st.info("Real-time recording would need a streaming-audio library such as "
        "PyAudio; for simplicity this demo takes an uploaded WAV file instead.")

# The uploader must live outside any st.button block: a button is True for a
# single rerun only, so widgets nested under it vanish on the next interaction.
uploaded_file = st.file_uploader("Choose a WAV file", type="wav")

if uploaded_file is not None:
    st.audio(uploaded_file, format="audio/wav")

    # Transcribe the uploaded WAV file
    audio_input, sample_rate = sf.read(uploaded_file)
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)  # downmix stereo to mono
    with torch.no_grad():
        transcription = asr_pipeline(
            {"raw": audio_input, "sampling_rate": sample_rate}
        )["text"]

    st.write("Transcription:", transcription)

    # Grammar correction
    st.header("Step 2: Correcting Grammar")
    st.write("Original Text:", transcription)

    grammar_input_text = f"correct: {transcription}"
    grammar_inputs = grammar_tokenizer(grammar_input_text, return_tensors="pt",
                                       max_length=512, truncation=True)
    # Without max_new_tokens, generate() defaults to very short outputs
    grammar_outputs = grammar_model.generate(**grammar_inputs, max_new_tokens=128)
    corrected_text = grammar_tokenizer.decode(grammar_outputs[0],
                                              skip_special_tokens=True)

    st.write("Corrected Text:", corrected_text)

    # Generate a reply
    st.header("Step 3: Generate Reply")
    reply_input = f"reply: {corrected_text}"
    reply_inputs = grammar_tokenizer(reply_input, return_tensors="pt",
                                     max_length=512, truncation=True)
    reply_outputs = grammar_model.generate(**reply_inputs, max_new_tokens=128)
    reply_text = grammar_tokenizer.decode(reply_outputs[0],
                                          skip_special_tokens=True)

    st.write("Generated Reply:", reply_text)

    # Convert reply to voice
    st.header("Step 4: Convert Reply to Voice")
    tts_inputs = tts_processor(text=reply_text, return_tensors="pt")
    with torch.no_grad():
        speech = tts_model.generate_speech(tts_inputs["input_ids"],
                                           speaker_embedding,
                                           vocoder=tts_vocoder)

    # Save and play the generated speech (SpeechT5 produces 16 kHz audio)
    sf.write("output.wav", speech.numpy(), 16000)
    st.audio("output.wav", format="audio/wav")
    # Server-side playback via pydub; only audible when running the app locally
    audio = AudioSegment.from_wav("output.wav")
    play(audio)

    st.success("Process completed! You can continue the conversation.")
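To run this as a Space, the app also needs its Python dependencies declared. A minimal requirements.txt sketch, assuming the model-loading code above (datasets is only needed for the speaker-embedding lookup, sentencepiece for the T5 tokenizer; exact version pins are left to the reader):

    streamlit
    torch
    transformers
    datasets
    soundfile
    sentencepiece
    pydub

pydub playback additionally needs a system audio backend (ffmpeg's ffplay, or the simpleaudio package); on Hugging Face Spaces, system packages such as ffmpeg are declared in packages.txt rather than requirements.txt.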