Update app.py
app.py
CHANGED
import streamlit as st
import soundfile as sf
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
)
from datasets import load_dataset
from pydub import AudioSegment
from pydub.playback import play

# Model identifiers
asr_model_name = "facebook/wav2vec2-large-robust-ft-swbd-300h"
# NOTE: vanilla T5 is not fine-tuned for "correct:"/"reply:" prompts, so output
# quality will be limited; a grammar-correction checkpoint would fit better here.
grammar_model_name = "google-t5/t5-base"
tts_model_name = "microsoft/speecht5_tts"

# Initialize pipelines and models, cached so Streamlit reruns don't reload them
@st.cache_resource
def load_models():
    asr = pipeline("automatic-speech-recognition", model=asr_model_name)
    tokenizer = AutoTokenizer.from_pretrained(grammar_model_name)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(grammar_model_name)
    processor = SpeechT5Processor.from_pretrained(tts_model_name)
    tts = SpeechT5ForTextToSpeech.from_pretrained(tts_model_name)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    # SpeechT5 requires a speaker embedding; use an x-vector from CMU ARCTIC
    embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)
    return asr, tokenizer, seq2seq, processor, tts, vocoder, speaker_embedding

(asr_pipeline, grammar_tokenizer, grammar_model,
 tts_processor, tts_model, tts_vocoder, speaker_embedding) = load_models()

# Streamlit UI setup
st.title("Voice-to-Voice Chatbot with Grammar Correction")

# Voice input
st.header("Step 1: Record your voice")
st.info("Real-time recording would need a streaming-audio library such as "
        "PyAudio; for simplicity this demo takes an uploaded WAV file instead.")

# The uploader must live outside any st.button block: a button is True for a
# single rerun only, so widgets nested under it vanish on the next interaction.
uploaded_file = st.file_uploader("Choose a WAV file", type="wav")

if uploaded_file is not None:
    st.audio(uploaded_file, format="audio/wav")

    # Transcribe the uploaded WAV file
    audio_input, sample_rate = sf.read(uploaded_file)
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)  # downmix stereo to mono
    with torch.no_grad():
        transcription = asr_pipeline(
            {"raw": audio_input, "sampling_rate": sample_rate}
        )["text"]

    st.write("Transcription:", transcription)

    # Grammar correction
    st.header("Step 2: Correcting Grammar")
    st.write("Original Text:", transcription)

    grammar_input_text = f"correct: {transcription}"
    grammar_inputs = grammar_tokenizer(grammar_input_text, return_tensors="pt",
                                       max_length=512, truncation=True)
    # Without max_new_tokens, generate() defaults to very short outputs
    grammar_outputs = grammar_model.generate(**grammar_inputs, max_new_tokens=128)
    corrected_text = grammar_tokenizer.decode(grammar_outputs[0],
                                              skip_special_tokens=True)

    st.write("Corrected Text:", corrected_text)

    # Generate a reply
    st.header("Step 3: Generate Reply")
    reply_input = f"reply: {corrected_text}"
    reply_inputs = grammar_tokenizer(reply_input, return_tensors="pt",
                                     max_length=512, truncation=True)
    reply_outputs = grammar_model.generate(**reply_inputs, max_new_tokens=128)
    reply_text = grammar_tokenizer.decode(reply_outputs[0],
                                          skip_special_tokens=True)

    st.write("Generated Reply:", reply_text)

    # Convert reply to voice
    st.header("Step 4: Convert Reply to Voice")
    tts_inputs = tts_processor(text=reply_text, return_tensors="pt")
    with torch.no_grad():
        speech = tts_model.generate_speech(tts_inputs["input_ids"],
                                           speaker_embedding,
                                           vocoder=tts_vocoder)

    # Save and play the generated speech (SpeechT5 produces 16 kHz audio)
    sf.write("output.wav", speech.numpy(), 16000)
    st.audio("output.wav", format="audio/wav")
    # Server-side playback via pydub; only audible when running the app locally
    audio = AudioSegment.from_wav("output.wav")
    play(audio)

    st.success("Process completed! You can continue the conversation.")
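To run this as a Space, the app also needs its Python dependencies declared. A minimal requirements.txt sketch, assuming the model-loading code above (datasets is only needed for the speaker-embedding lookup, sentencepiece for the T5 tokenizer; exact version pins are left to the reader):

    streamlit
    torch
    transformers
    datasets
    soundfile
    sentencepiece
    pydub

pydub playback additionally needs a system audio backend (ffmpeg's ffplay, or the simpleaudio package); on Hugging Face Spaces, system packages such as ffmpeg are declared in packages.txt rather than requirements.txt.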