Commit · 95841bc
Initial commit: Complete MediVox application

Files changed:
- .gitattributes +1 -0
- .gitignore +23 -0
- README.md +60 -0
- app.py +153 -0
- brain.py +42 -0
- doctorvoice.py +112 -0
- packages.txt +11 -0
- patientvoice.py +57 -0
- requirements.txt +23 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
+medical.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,23 @@
+# Environment
+.env
+medenv/
+venv/
+__pycache__/
+
+# Generated files
+*.pyc
+*.mp3
+*.wav
+*.jpg
+download.jpg
+Temp.mp3
+final.mp3
+patient_voice.mp3
+
+# Large files
+medical.pdf
+vectorstore/
+
+# IDE
+.vscode/
+.idea/
README.md
ADDED
@@ -0,0 +1,60 @@
+---
+title: MediVox - AI Doctor with Vision and Voice
+emoji: 👨⚕️
+colorFrom: blue
+colorTo: purple
+sdk: gradio
+sdk_version: 4.16.0
+app_file: app.py
+pinned: false
+---
+
+# AI Doctor with Vision and Voice
+
+This is an AI-powered medical assistant that can:
+- Accept voice input from patients
+- Analyze medical images
+- Provide medical insights using RAG (Retrieval-Augmented Generation)
+- Respond with natural voice output
+
+## Features
+
+- Speech-to-Text using Whisper
+- Image Analysis using Llama 3.2 Vision (via Groq)
+- RAG using FAISS and a medical knowledge base
+- Text-to-Speech using ElevenLabs
+- Context-aware responses using medical domain knowledge
+
+## Environment Variables Required
+
+```bash
+GROQ_API_KEY=your_groq_api_key
+ELEVENLABS_API_KEY=your_elevenlabs_api_key
+```
+
+## Usage
+
+1. Click the microphone button to record your question
+2. Upload or take a picture of the medical condition
+3. Wait for the AI doctor to analyze and respond
+4. Listen to the voice response or read the text output
+
+## Model Details
+
+- Vision Model: Llama 3.2 11B Vision (llama-3.2-11b-vision-preview, via Groq)
+- Speech-to-Text: Whisper Large V3 (via Groq)
+- Text Generation: Groq
+- Voice Generation: ElevenLabs
+- Embeddings: sentence-transformers/all-mpnet-base-v2
+
+## Citation
+
+If you use this space, please cite:
+```
+@misc{medivoicebot2024,
+  author = {Your Name},
+  title = {AI Doctor with Vision and Voice},
+  year = {2024},
+  publisher = {Hugging Face Spaces},
+}
+```
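For a local run outside the Space, a minimal sketch of the setup follows, assuming a Debian/Ubuntu host, a working microphone, and a local copy of `medical.pdf` in the repo root (the file is LFS-tracked but also listed in `.gitignore`, so it may need to be supplied manually). The commands only combine what `packages.txt`, `requirements.txt`, and the README above already specify:

```bash
# System packages (Debian/Ubuntu); the full list lives in packages.txt
sudo apt-get install -y $(cat packages.txt)
pip install -r requirements.txt

# API keys required by the app (see "Environment Variables Required" above)
export GROQ_API_KEY=your_groq_api_key
export ELEVENLABS_API_KEY=your_elevenlabs_api_key

# First run builds vectorstore/db_faiss from medical.pdf, then starts Gradio
python app.py
```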
app.py
ADDED
@@ -0,0 +1,153 @@
+import os
+import gradio as gr
+import pathlib
+import torch
+import faiss
+from sentence_transformers import SentenceTransformer
+
+from brain import encode_image, analyze_image_with_query
+from patientvoice import record_audio, transcribe_with_groq
+from doctorvoice import text_to_speech_with_gtts, text_to_speech_with_elevenlabs
+from dotenv import load_dotenv
+load_dotenv()
+from langchain_community.vectorstores import FAISS
+from langchain_core.embeddings import Embeddings
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+# Check if CUDA is available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+
+# Initialize embeddings model
+class SentenceTransformerEmbeddings(Embeddings):
+    def __init__(self, model_name: str, device: str = None):
+        self.model = SentenceTransformer(model_name, device=device)
+
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
+        embeddings = self.model.encode(texts, convert_to_tensor=False)
+        return embeddings.tolist()
+
+    def embed_query(self, text: str) -> list[float]:
+        embedding = self.model.encode(text, convert_to_tensor=False)
+        return embedding.tolist()
+
+embeddings = SentenceTransformerEmbeddings(
+    model_name="sentence-transformers/all-mpnet-base-v2",
+    device=device
+)
+
+# Define vectorstore paths consistently
+VECTORSTORE_DIR = "vectorstore/db_faiss"
+vectorstore_path = pathlib.Path(VECTORSTORE_DIR)
+
+# Create vectorstore directory if it doesn't exist
+vectorstore_path.mkdir(parents=True, exist_ok=True)
+
+if not (vectorstore_path / "index.faiss").exists():
+    print("Creating new vectorstore...")
+    # Load and split the PDF
+    loader = PyPDFLoader("medical.pdf")
+    documents = loader.load()
+
+    # Split documents into chunks
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+    )
+    texts = text_splitter.split_documents(documents)
+
+    # Create and save the vectorstore
+    vectorstore = FAISS.from_documents(texts, embeddings)
+
+    # If CUDA is available, move the index to the GPU
+    if device == "cuda":
+        res = faiss.StandardGpuResources()  # Initialize GPU resources
+        index = vectorstore.index
+        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)  # Move to GPU
+        vectorstore.index = gpu_index
+
+    # Save the vectorstore
+    vectorstore.save_local(VECTORSTORE_DIR)
+    print("Vectorstore created and saved successfully.")
+else:
+    print("Loading existing vectorstore...")
+    # Load existing vectorstore
+    vectorstore = FAISS.load_local(
+        folder_path=VECTORSTORE_DIR,
+        embeddings=embeddings,
+        allow_dangerous_deserialization=True
+    )
+
+    # If CUDA is available, move the loaded index to the GPU
+    if device == "cuda":
+        res = faiss.StandardGpuResources()  # Initialize GPU resources
+        index = vectorstore.index
+        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)  # Move to GPU
+        vectorstore.index = gpu_index
+    print("Vectorstore loaded successfully.")
+
+def get_relevant_context(query):
+    try:
+        # Search the vector store for relevant documents
+        docs = vectorstore.similarity_search(query, k=2)
+
+        # Extract and combine the content from retrieved documents
+        context = "\n".join([doc.page_content for doc in docs])
+
+        return context
+    except Exception as e:
+        print(f"Error in similarity search: {e}")
+        return "Could not retrieve relevant context."
+
+# Build the system prompt around the retrieved context
+def get_enhanced_prompt(query, context):
+    enhanced_prompt = f"""You have to act as a professional doctor; I know you are not, but this is for learning purposes.
+    Use the following medical context to inform your response: {context}
+    What's in this image? Do you find anything wrong with it medically?
+    If you make a differential, suggest some remedies for them. Do not add any numbers or special characters in
+    your response. Your response should be in one long paragraph. Also always answer as if you are answering a real person.
+    Do not say 'In the image I see' but say 'With what I see, I think you have ...'.
+    Do not respond as an AI model in markdown; your answer should mimic that of an actual doctor, not an AI bot.
+    Keep your answer concise (max 2 sentences). No preamble, start your answer right away please.
+    Question from patient: {query}"""
+    return enhanced_prompt
+
+def process_inputs(audio_filepath, image_filepath):
+    speech_to_text_output = transcribe_with_groq(GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
+                                                 audio_filepath=audio_filepath,
+                                                 stt_model="whisper-large-v3")
+
+    # Get relevant context from the vector store
+    context = get_relevant_context(speech_to_text_output)
+
+    # Handle the image input
+    if image_filepath:
+        enhanced_prompt = get_enhanced_prompt(speech_to_text_output, context)
+        doctor_response = analyze_image_with_query(query=enhanced_prompt, encoded_image=encode_image(image_filepath), model="llama-3.2-11b-vision-preview")
+    else:
+        doctor_response = "No image provided for me to analyze"
+
+    voice_of_doctor = text_to_speech_with_elevenlabs(input_text=doctor_response, output_filepath="final.mp3")
+
+    return speech_to_text_output, doctor_response, voice_of_doctor
+
+
+# Create the interface
+iface = gr.Interface(
+    fn=process_inputs,
+    inputs=[
+        gr.Audio(sources=["microphone"], type="filepath"),
+        gr.Image(type="filepath")
+    ],
+    outputs=[
+        gr.Textbox(label="Speech to Text"),
+        gr.Textbox(label="Doctor's Response"),
+        gr.Audio(label="Doctor's Voice")  # plays the filepath returned by process_inputs
+    ],
+    title="AI Doctor with Vision and Voice"
+)
+
+iface.launch(debug=True)
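The RAG flow in app.py is: transcribe the question, retrieve the two closest chunks from the FAISS index built over medical.pdf, and splice them into the vision prompt. A minimal standalone check of the retrieval step is sketched below; it assumes the index was already built by app.py, and it re-declares the same embeddings wrapper so the saved index loads with identical embeddings (the sample query string is illustrative):

```python
from sentence_transformers import SentenceTransformer
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import FAISS

class SentenceTransformerEmbeddings(Embeddings):
    """Same wrapper as in app.py, so the saved index loads with matching embeddings."""
    def __init__(self, model_name: str):
        self.model = SentenceTransformer(model_name)
    def embed_documents(self, texts):
        return self.model.encode(texts, convert_to_tensor=False).tolist()
    def embed_query(self, text):
        return self.model.encode(text, convert_to_tensor=False).tolist()

embeddings = SentenceTransformerEmbeddings("sentence-transformers/all-mpnet-base-v2")
store = FAISS.load_local(
    "vectorstore/db_faiss",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

# Print the start of the two chunks that would be spliced into the doctor prompt
for doc in store.similarity_search("persistent rash on the forearm", k=2):
    print(doc.page_content[:200])
```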
brain.py
ADDED
@@ -0,0 +1,42 @@
+import os
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+
+import base64
+
+def encode_image(image_path):
+    # Read the image and return it as a base64-encoded string
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+# Step 3: Set up the multimodal LLM
+from groq import Groq
+
+# Example defaults (the caller in app.py passes its own query and model)
+query = "Is there something wrong with my face?"
+model = "llama-3.2-90b-vision-preview"
+
+def analyze_image_with_query(query, model, encoded_image):
+    client = Groq()
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": query
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{encoded_image}",
+                    },
+                },
+            ],
+        }
+    ]
+    chat_completion = client.chat.completions.create(
+        messages=messages,
+        model=model
+    )
+
+    return chat_completion.choices[0].message.content
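brain.py can also be exercised on its own for a quick check of the Groq vision call. A small sketch, assuming GROQ_API_KEY is set in the environment and using a hypothetical local image file test.jpg (the filename and model are illustrative; app.py passes the 11B vision model):

```python
from brain import encode_image, analyze_image_with_query

# "test.jpg" is a placeholder local image used only for this check
encoded = encode_image("test.jpg")
answer = analyze_image_with_query(
    query="Is there something wrong with my face?",
    model="llama-3.2-11b-vision-preview",
    encoded_image=encoded,
)
print(answer)
```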
doctorvoice.py
ADDED
@@ -0,0 +1,112 @@
+# If you don't use pipenv, keep the following:
+from dotenv import load_dotenv
+load_dotenv()
+
+# Step 1a: Set up the text-to-speech (TTS) model with gTTS
+import os
+from gtts import gTTS
+
+def text_to_speech_with_gtts_old(input_text, output_filepath):
+    language = "en"
+
+    audioobj = gTTS(
+        text=input_text,
+        lang=language,
+        slow=False
+    )
+    audioobj.save(output_filepath)
+
+
+# input_text = "Hi"
+# text_to_speech_with_gtts_old(input_text=input_text, output_filepath="gtts_testing.mp3")
+
+# Step 1b: Set up the text-to-speech (TTS) model with ElevenLabs
+import elevenlabs
+from elevenlabs.client import ElevenLabs
+
+ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
+
+def text_to_speech_with_elevenlabs_old(input_text, output_filepath):
+    client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
+    audio = client.generate(
+        text=input_text,
+        voice="Emily",
+        output_format="mp3_22050_32",
+        model="eleven_turbo_v2"
+    )
+    elevenlabs.save(audio, output_filepath)
+
+# text_to_speech_with_elevenlabs_old(input_text, output_filepath="elevenlabs_testing.mp3")
+
+# Step 2: Play the generated voice automatically
+# The saved doctor audio files do not play by themselves, so this step plays them right after they are written.
+import subprocess
+import platform
+from pydub import AudioSegment
+from pydub.playback import play
+import tempfile
+
+def text_to_speech_with_gtts(input_text, output_filepath):
+    language = "en"
+
+    audioobj = gTTS(
+        text=input_text,
+        lang=language,
+        slow=False
+    )
+    audioobj.save(output_filepath)
+    os_name = platform.system()
+    try:
+        if os_name == "Darwin":  # macOS
+            subprocess.run(['afplay', output_filepath])
+        elif os_name == "Windows":  # Windows
+            subprocess.run(['powershell', '-c', f'(New-Object Media.SoundPlayer "{output_filepath}").PlaySync();'])
+        elif os_name == "Linux":  # Linux
+            subprocess.run(['aplay', output_filepath])  # Alternative: use 'mpg123' or 'ffplay'
+        else:
+            raise OSError("Unsupported operating system")
+    except Exception as e:
+        print(f"An error occurred while trying to play the audio: {e}")
+
+
+# input_text = "Hi"
+# text_to_speech_with_gtts(input_text=input_text, output_filepath="gtts_testing_autoplay.mp3")
+
+def play_audio(file_path):
+    os_name = platform.system()
+    try:
+        if os_name == "Darwin":  # macOS
+            subprocess.run(['afplay', file_path])
+        elif os_name == "Windows":  # Windows
+            # Load MP3 and convert to WAV for playback
+            audio = AudioSegment.from_mp3(file_path)
+            # Create a temporary WAV file
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
+                wav_path = temp_wav.name
+            audio.export(wav_path, format='wav')
+            # Play the WAV file
+            subprocess.run(['powershell', '-c', f'(New-Object Media.SoundPlayer "{wav_path}").PlaySync();'])
+            # Clean up temporary file
+            os.unlink(wav_path)
+        elif os_name == "Linux":  # Linux
+            subprocess.run(['mpg123', file_path])  # Using mpg123 for MP3 playback
+        else:
+            raise OSError("Unsupported operating system")
+    except Exception as e:
+        print(f"An error occurred while trying to play the audio: {e}")
+
+def text_to_speech_with_elevenlabs(input_text, output_filepath):
+    client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
+    audio = client.generate(
+        text=input_text,
+        voice="Aria",
+        output_format="mp3_22050_32",
+        model="eleven_turbo_v2"
+    )
+    elevenlabs.save(audio, output_filepath)
+
+    # Play the audio
+    play_audio(output_filepath)
+    return output_filepath
+
+# text_to_speech_with_elevenlabs(input_text, output_filepath="elevenlabs_testing_autoplay.mp3")
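Both TTS helpers save the audio to a file and then attempt local playback; on a headless host such as the Space, the playback call fails and is caught, and the returned file path is what the Gradio audio component plays. A quick local check, assuming ELEVENLABS_API_KEY is set (the sample text is illustrative):

```python
from doctorvoice import text_to_speech_with_elevenlabs

# Writes final.mp3, attempts local playback, and returns the path for Gradio
path = text_to_speech_with_elevenlabs(
    input_text="Take rest and stay hydrated.",
    output_filepath="final.mp3",
)
print(path)
```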
packages.txt
ADDED
@@ -0,0 +1,11 @@
+python3-dev
+portaudio19-dev
+python3-pyaudio
+ffmpeg
+libsndfile1
+build-essential
+pkg-config
+git
+libasound2-dev
+python3-all-dev
+libportaudio2
patientvoice.py
ADDED
@@ -0,0 +1,57 @@
+import logging
+import speech_recognition as sr
+from pydub import AudioSegment
+from io import BytesIO
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def record_audio(file_path, timeout=20, phrase_time_limit=None):
+    """
+    Simplified function to record audio from the microphone and save it as an MP3 file.
+
+    Args:
+    file_path (str): Path to save the recorded audio file.
+    timeout (int): Maximum time to wait for a phrase to start (in seconds).
+    phrase_time_limit (int): Maximum time for the phrase to be recorded (in seconds).
+    """
+    recognizer = sr.Recognizer()
+
+    try:
+        with sr.Microphone() as source:
+            logging.info("Adjusting for ambient noise...")
+            recognizer.adjust_for_ambient_noise(source, duration=1)
+            logging.info("Start speaking now...")
+
+            # Record the audio
+            audio_data = recognizer.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
+            logging.info("Recording complete.")
+
+            # Convert the recorded audio to an MP3 file
+            wav_data = audio_data.get_wav_data()
+            audio_segment = AudioSegment.from_wav(BytesIO(wav_data))
+            audio_segment.export(file_path, format="mp3", bitrate="128k")
+
+            logging.info(f"Audio saved to {file_path}")
+
+    except Exception as e:
+        logging.error(f"An error occurred: {e}")
+
+import os
+from groq import Groq
+from dotenv import load_dotenv
+
+load_dotenv()
+GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+stt_model = "whisper-large-v3"
+
+def transcribe_with_groq(stt_model, audio_filepath, GROQ_API_KEY):
+    client = Groq(api_key=GROQ_API_KEY)
+
+    # Send the recorded audio to Groq's Whisper endpoint for transcription
+    with open(audio_filepath, "rb") as audio_file:
+        transcription = client.audio.transcriptions.create(
+            model=stt_model,
+            file=audio_file,
+            language="en"
+        )
+
+    return transcription.text
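A quick end-to-end check of the patient side, assuming a local microphone and GROQ_API_KEY in the environment (the output filename matches one already listed in .gitignore):

```python
import os
from patientvoice import record_audio, transcribe_with_groq

# Record up to 10 seconds of speech and save it as MP3
record_audio(file_path="patient_voice.mp3", timeout=20, phrase_time_limit=10)

# Transcribe the recording with Groq's Whisper Large V3
text = transcribe_with_groq(
    stt_model="whisper-large-v3",
    audio_filepath="patient_voice.mp3",
    GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
)
print(text)
```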
requirements.txt
ADDED
@@ -0,0 +1,23 @@
+gradio
+python-dotenv
+groq
+langchain
+langchain-core
+langchain-community
+sentence-transformers
+chromadb
+PyPDF2
+transformers
+torch
+torchaudio
+SpeechRecognition
+pydub
+ffmpeg-python
+gTTS
+elevenlabs
+faiss-cpu
+requests
+numpy
+typing-inspect
+typing_extensions
+pypdf