m-adil-ali committed on
Commit 722737c · verified · 1 Parent(s): 5f29348

Update app.py

Files changed (1)
  1. app.py +76 -0
app.py CHANGED
@@ -0,0 +1,76 @@
+ import streamlit as st
+ import soundfile as sf
+ import torch
+ from transformers import (
+     pipeline,
+     AutoProcessor,
+     AutoTokenizer,
+     AutoModelForSeq2SeqLM,
+     AutoModelForTextToSpectrogram,
+     SpeechT5HifiGan,
+ )
+ from datasets import load_dataset
+ from pydub import AudioSegment
+ from pydub.playback import play
+ 
+ # Model checkpoints
+ asr_model_id = "facebook/wav2vec2-large-robust-ft-swbd-300h"
+ grammar_model_id = "google-t5/t5-base"
+ tts_model_id = "microsoft/speecht5_tts"
+ 
+ # Initialize pipelines and models (distinct names so the loaded models do not
+ # shadow the checkpoint-ID strings above)
+ asr_pipeline = pipeline("automatic-speech-recognition", model=asr_model_id)
+ grammar_tokenizer = AutoTokenizer.from_pretrained(grammar_model_id)
+ grammar_model = AutoModelForSeq2SeqLM.from_pretrained(grammar_model_id)
+ tts_processor = AutoProcessor.from_pretrained(tts_model_id)
+ tts_model = AutoModelForTextToSpectrogram.from_pretrained(tts_model_id)
+ # SpeechT5 predicts a spectrogram; turning it into a waveform needs the
+ # HiFi-GAN vocoder plus a speaker embedding (x-vector) to pick a voice.
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+ speaker_embeddings = torch.tensor(
+     load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
+ ).unsqueeze(0)
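+ # NOTE: Streamlit reruns this whole script on every interaction, so the loads
+ # above repeat each time. A cached factory avoids that; a minimal sketch,
+ # assuming only Streamlit's built-in st.cache_resource:
+ #
+ #     @st.cache_resource
+ #     def load_models():
+ #         return (
+ #             pipeline("automatic-speech-recognition", model=asr_model_id),
+ #             AutoModelForSeq2SeqLM.from_pretrained(grammar_model_id),
+ #         )  # ...and so on for the other models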
+ 
+ # Streamlit UI setup
+ st.title("Voice-to-Voice Chatbot with Grammar Correction")
+ 
+ # Voice input
+ st.header("Step 1: Record your voice")
+ st.info("Upload a WAV recording of your voice. Speak clearly into your microphone.")
+ 
+ # True real-time recording would need a library that can stream audio into
+ # Python (e.g. PyAudio or streamlit-webrtc); here we simulate it with an
+ # uploaded file. The uploader sits at the top level on purpose: Streamlit
+ # reruns the script on every interaction, so a file_uploader nested inside an
+ # `if st.button(...)` block would vanish again before a file could be chosen.
+ uploaded_file = st.file_uploader("Choose a WAV file", type="wav")
+ 
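+ # A minimal local-recording sketch (untested; assumes the third-party
+ # `sounddevice` package, and records on the machine running the server, so it
+ # only makes sense when the app runs locally):
+ #
+ #     import sounddevice as sd
+ #     duration, fs = 5, 16000                      # seconds, sample rate
+ #     recording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
+ #     sd.wait()                                    # block until capture ends
+ #     sf.write("recording.wav", recording, fs)
+ 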
+ if uploaded_file is not None:
+     st.audio(uploaded_file, format="audio/wav")
+ 
+     # Transcribe the uploaded WAV file
+     audio_input, sample_rate = sf.read(uploaded_file, dtype="float32")
+     if audio_input.ndim > 1:
+         audio_input = audio_input.mean(axis=1)  # downmix stereo to mono
+     with torch.no_grad():
+         # Passing the file's sample rate lets the pipeline resample to the
+         # 16 kHz the wav2vec2 checkpoint expects.
+         transcription = asr_pipeline(
+             {"raw": audio_input, "sampling_rate": sample_rate}
+         )["text"]
+ 
+     st.write("Transcription:", transcription)
+ 
+     # Grammar correction
+     st.header("Step 2: Correcting Grammar")
+     st.write("Original Text:", transcription)
+ 
+     # NOTE: the stock t5-base checkpoint was never trained on a "correct:"
+     # prefix, so outputs will be rough; a grammar-correction fine-tune is a
+     # better fit (see the sketch below).
+     grammar_input_text = f"correct: {transcription}"
+     grammar_inputs = grammar_tokenizer(
+         grammar_input_text, return_tensors="pt", max_length=512, truncation=True
+     )
+     # max_new_tokens lifts the 20-token default generation limit
+     grammar_outputs = grammar_model.generate(**grammar_inputs, max_new_tokens=64)
+     corrected_text = grammar_tokenizer.decode(grammar_outputs[0], skip_special_tokens=True)
+ 
+     st.write("Corrected Text:", corrected_text)
+ 
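+     # Swapping in a dedicated grammar-correction checkpoint is a two-line
+     # change (sketch — verify the checkpoint name and its task prefix on the
+     # Hub before relying on it):
+     #
+     #     grammar_model_id = "vennify/t5-base-grammar-correction"
+     #     grammar_input_text = f"grammar: {transcription}"
+ 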
+     # Generate a reply
+     st.header("Step 3: Generate Reply")
+     # NOTE: "reply:" is likewise not a task t5-base knows; see the
+     # dialogue-model sketch below for a more natural fit.
+     reply_input = f"reply: {corrected_text}"
+     reply_inputs = grammar_tokenizer(
+         reply_input, return_tensors="pt", max_length=512, truncation=True
+     )
+     reply_outputs = grammar_model.generate(**reply_inputs, max_new_tokens=64)
+     reply_text = grammar_tokenizer.decode(reply_outputs[0], skip_special_tokens=True)
+ 
+     st.write("Generated Reply:", reply_text)
+ 
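+     # A dialogue model is a more natural reply generator (sketch, assuming
+     # the public DialoGPT checkpoint; note this pipeline echoes the prompt in
+     # its output):
+     #
+     #     chat = pipeline("text-generation", model="microsoft/DialoGPT-medium")
+     #     reply_text = chat(corrected_text, max_new_tokens=50)[0]["generated_text"]
+ 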
+     # Convert reply to voice
+     st.header("Step 4: Convert Reply to Voice")
+     input_ids = tts_processor(text=reply_text, return_tensors="pt").input_ids
+     with torch.no_grad():
+         # generate_speech needs the speaker embedding and, to get a waveform
+         # back instead of a spectrogram, the HiFi-GAN vocoder loaded above.
+         speech = tts_model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)
+ 
+     # Save the generated speech; SpeechT5 outputs 16 kHz audio
+     sf.write("output.wav", speech.numpy(), samplerate=16000)
+     st.audio("output.wav", format="audio/wav")
+     # pydub playback runs on the machine hosting the Streamlit server, so it
+     # only adds anything when the app is run locally; st.audio above already
+     # plays the clip in the browser.
+     audio = AudioSegment.from_wav("output.wav")
+     play(audio)
+ 
+     st.success("Process completed! You can continue the conversation.")
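
To try the change locally, install the dependencies used above (streamlit, transformers, torch, datasets, soundfile, pydub) and run `streamlit run app.py`, then open the URL Streamlit prints.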