In [1]:
import asyncio
import gc
import logging
import os
import threading
import time

import edge_tts
import gradio as gr
import numpy as np
import openai
import torch
import torchaudio
import whisperx
from silero_vad import get_speech_timestamps, load_silero_vad

 # Ask torchaudio to use the "soundfile" backend for audio I/O.
 # NOTE(review): recent torchaudio releases appear to deprecate this call — confirm against the installed version.
 torchaudio.set_audio_backend("soundfile")


In [2]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Pick the compute device once; every model below is placed on it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
logging.info(f'Using device: {device}')

# Load Silero VAD model
vad_model = load_silero_vad().to(device)  # Ensure the model is on the correct device
logging.info('Loaded Silero VAD model')

# Load WhisperX model.
# float16 inference is only available on GPU; use int8 on CPU so the cell
# still runs on machines without CUDA instead of failing at load time.
whisper_compute_type = "float16" if device == 'cuda' else "int8"
whisper_model = whisperx.load_model("tiny", device, compute_type=whisper_compute_type)
logging.info('Loaded WhisperX model')

# Read the API key from the environment rather than hardcoding it in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai.api_key:
    logging.warning('OPENAI_API_KEY is not set; OpenAI requests will fail')
logging.info('Set OpenAI API key')

# TTS Voice
TTS_VOICE = "en-GB-SoniaNeural"

2024-09-23 13:50:24,408 - INFO - Using device: cuda
2024-09-23 13:50:24,660 - INFO - Loaded Silero VAD model
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.cache/torch/whisperx-vad-segmentation.bin`
2024-09-23 13:50:24,994 - INFO - Loaded WhisperX model
2024-09-23 13:50:24,994 - INFO - Set OpenAI API key


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.1+cu121. Bad things might happen unless you revert torch to 1.x.


In [3]:
import torch
import torchaudio
import logging

def check_vad(audio_data, sample_rate):
    """Report whether the Silero VAD model finds any speech in a clip.

    Args:
        audio_data: NumPy array of audio samples (wrapped with ``torch.from_numpy``).
        sample_rate: Sample rate of ``audio_data`` in Hz.

    Returns:
        True when at least one speech timestamp is detected, otherwise False.
    """
    logging.info('Checking voice activity')
    target_sample_rate = 16000

    waveform = torch.from_numpy(audio_data)
    if sample_rate != target_sample_rate:
        # Resample so the clip matches the rate handed to the VAD below.
        to_target_rate = torchaudio.transforms.Resample(
            orig_freq=sample_rate,
            new_freq=target_sample_rate,
        )
        waveform = to_target_rate(waveform)
    audio_tensor = waveform.to(device)

    # Log audio data details
    logging.info(f'Audio tensor shape: {audio_tensor.shape}, dtype: {audio_tensor.dtype}, device: {audio_tensor.device}')

    # Detection thresholds, kept together so they are easy to tune.
    vad_options = {
        'sampling_rate': target_sample_rate,
        'min_speech_duration_ms': 250,
        'min_silence_duration_ms': 80,
        'speech_pad_ms': 30,
    }
    speech_timestamps = get_speech_timestamps(audio=audio_tensor, model=vad_model, **vad_options)
    logging.info(f'Found {len(speech_timestamps)} speech timestamps')
    return len(speech_timestamps) != 0

In [4]:
def transcript(audio_data, sample_rate):
    """Transcribe an audio clip with the module-level WhisperX model.

    Args:
        audio_data: NumPy array of audio samples.
        sample_rate: Sample rate of ``audio_data`` in Hz; the clip is
            resampled to 16 kHz when it differs.

    Returns:
        The text of every returned segment concatenated in order, or an empty
        string when the model returns no segments.
    """
    logging.info('Transcribing audio')
    # Resample to 16000 Hz if necessary
    target_sample_rate = 16000
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        audio_data = resampler(torch.from_numpy(audio_data)).numpy()

    # Transcribe
    batch_size = 16  # Adjust as needed
    result = whisper_model.transcribe(audio_data, batch_size=batch_size)

    # Use every segment, not just the first, and tolerate an empty segment
    # list (e.g. silence) instead of raising IndexError. Segment texts are
    # joined unmodified so callers that append transcripts keep the spacing.
    segments = result['segments']
    text = ''.join(segment['text'] for segment in segments)
    if not segments:
        logging.warning('Transcription returned no segments')
    logging.info(f'Transcription result: {text}')

    # Clear GPU memory
    del segments, result
    gc.collect()
    if device == 'cuda':
        torch.cuda.empty_cache()
    return text

In [5]:
from openai import OpenAI

# Build the client from the OPENAI_API_KEY environment variable so no secret
# is stored in the notebook.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def llm(text):
    """Stream a chat completion for ``text`` and yield each content delta.

    Args:
        text: The user's transcribed utterance, sent as the user message.

    Yields:
        ``delta.content`` of the first choice of every streamed chunk, exactly
        as received (this may be ``None`` for chunks that carry no content).
    """
    logging.info('Getting response from OpenAI API')
    conversation = [
        {"role": "system", "content": "You respond to the following transcript from the conversation that you are having with the user."},
        {"role": "user", "content": text},
    ]
    completion_stream = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        stream=True,
        temperature=0.7,  # sampling temperature; adjust as needed
        top_p=0.9,  # nucleus sampling cutoff; adjust as needed
    )
    yield from (part.choices[0].delta.content for part in completion_stream)

In [7]:
def _is_speakable(text):
    """Return True when ``text`` contains at least one letter or digit."""
    return any(char.isalnum() for char in text)


def _split_complete_sentences(buffer, terminators):
    """Split ``buffer`` into (finished sentences, unfinished remainder)."""
    sentences = []
    start = 0
    for index, char in enumerate(buffer):
        if char in terminators:
            sentences.append(buffer[start:index + 1].strip())
            start = index + 1
    return sentences, buffer[start:]


def _synthesize_speech(text):
    """Yield the audio payloads that edge-tts streams back for ``text``."""
    communicate = edge_tts.Communicate(text, TTS_VOICE)
    for chunk in communicate.stream_sync():
        if chunk["type"] == "audio":
            yield chunk["data"]


def tts_streaming(text_stream):
    """Turn a stream of text fragments into a stream of TTS audio payloads.

    Fragments are buffered until a sentence terminator ('.', '!' or '?') is
    seen; each finished sentence is synthesized right away so audio can start
    before the whole text has arrived. Text left over when the stream ends is
    synthesized last.

    Args:
        text_stream: Iterable of text fragments; ``None`` entries are ignored.

    Yields:
        The ``data`` payload of every "audio" chunk produced by edge-tts.
    """
    logging.info('Performing TTS')
    buffer = ""
    punctuation = {'.', '!', '?'}
    for text_chunk in text_stream:
        if text_chunk is None:
            continue
        buffer += text_chunk
        # Check for sentence completion
        sentences, buffer = _split_complete_sentences(buffer, punctuation)
        for sentence in sentences:
            # Skip fragments with nothing to pronounce (e.g. the extra dots
            # of "...") rather than issuing a TTS request for them.
            if _is_speakable(sentence):
                yield from _synthesize_speech(sentence)

    # Process any remaining text
    remainder = buffer.strip()
    if _is_speakable(remainder):
        yield from _synthesize_speech(remainder)

In [8]:
# load audio to numpy array
def load_audio(audio_path):
    """Load an audio file as a mono NumPy waveform.

    Args:
        audio_path: Path of the audio file, passed to ``torchaudio.load``.

    Returns:
        Tuple ``(audio_data, sample_rate)`` where ``audio_data`` is a 1-D
        array and ``sample_rate`` is the value reported by ``torchaudio.load``.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    audio_data = waveform.numpy()
    # Average across the leading (channel) axis so multi-channel files are
    # mixed down instead of silently keeping only the first channel.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=0)
    return audio_data, sample_rate

In [9]:
# Testing the pipeline

# 1. Load audio
# `load_audio` returns the waveform array and its sample rate; both names are
# reused by the cells below.
audio_path = 'audio.mp3'
audio_data, sample_rate = load_audio(audio_path)

In [10]:
chunk_size = 500  # ms
chunk_size_samples = int(sample_rate * chunk_size / 1000)

# Cut the waveform into consecutive windows of `chunk_size_samples` samples
# (the final window may be shorter).
chunks = []
for chunk_start in range(0, len(audio_data), chunk_size_samples):
    chunks.append(audio_data[chunk_start:chunk_start + chunk_size_samples])

# 2. Check voice activity, one result per window
voice_activity = [check_vad(chunk, sample_rate) for chunk in chunks]

2024-09-23 13:50:49,248 - INFO - Checking voice activity
2024-09-23 13:50:49,253 - INFO - Audio tensor shape: torch.Size([8000]), dtype: torch.float32, device: cuda:0
2024-09-23 13:50:49,494 - INFO - Found 1 speech timestamps
2024-09-23 13:50:49,495 - INFO - Checking voice activity
2024-09-23 13:50:49,498 - INFO - Audio tensor shape: torch.Size([8000]), dtype: torch.float32, device: cuda:0
2024-09-23 13:50:49,506 - INFO - Found 1 speech timestamps
2024-09-23 13:50:49,507 - INFO - Checking voice activity
2024-09-23 13:50:49,511 - INFO - Audio tensor shape: torch.Size([8000]), dtype: torch.float32, device: cuda:0
2024-09-23 13:50:49,518 - INFO - Found 1 speech timestamps
2024-09-23 13:50:49,519 - INFO - Checking voice activity
2024-09-23 13:50:49,523 - INFO - Audio tensor shape: torch.Size([8000]), dtype: torch.float32, device: cuda:0
2024-09-23 13:50:49,531 - INFO - Found 1 speech timestamps
2024-09-23 13:50:49,532 - INFO - Checking voice activity
2024-09-23 13:50:49,535 - INFO - Audio 

In [11]:
# 3. Transcribe the whole clip; `text` holds the string returned by `transcript`.
text = transcript(audio_data, sample_rate)

2024-09-23 13:50:50,691 - INFO - Transcribing audio


Detected language: en (0.99) in first 30s of audio...


2024-09-23 13:50:51,041 - INFO - Transcription result: What's this the reporter tried to make a hit piece about Wu Kong is not happy. I wonder why? What a shock. Well wait a second. Should we get to the bottom of this?


In [12]:
# 4. Stream the LLM reply into TTS.
# Keep `text` bound to the transcript string: rebinding it to the generator
# would make this cell fail on re-run, because `llm` would then receive a
# generator instead of text.
response_stream = llm(text)
tts_audio = tts_streaming(response_stream)

In [13]:
from IPython.display import Audio
from pydub import AudioSegment
from io import BytesIO
import base64

# Combine audio chunk bytes
audio_bytes = b''.join(tts_audio)

# edge-tts streams MP3-encoded audio, so decode it as MP3 rather than
# reinterpreting the compressed bytes as raw 16 kHz PCM.
audio_segment = AudioSegment.from_file(BytesIO(audio_bytes), format="mp3")

# Play audio: encoded bytes carry their own sample rate, so `rate` is not needed.
Audio(audio_bytes)

2024-09-23 13:50:53,979 - INFO - Performing TTS
2024-09-23 13:50:53,980 - INFO - Getting response from OpenAI API
2024-09-23 13:50:54,236 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [19]:
# Decode the MP3 payload into 16-bit PCM samples; viewing the compressed bytes
# directly as int16 would yield noise (or fail on an odd byte count).
decoded_audio = AudioSegment.from_file(BytesIO(audio_bytes), format="mp3").set_sample_width(2)
np_audio = np.array(decoded_audio.get_array_of_samples(), dtype=np.int16)

# Export through pydub so output.wav gets a proper WAV header.
decoded_audio.export("output.wav", format="wav")

In [None]:
# function to process audio input
def process_audio_old(audio, state):
 """
 Legacy streaming handler kept for reference only.

 The body consists of this docstring, an inert string literal holding the
 previous implementation, and `...`; calling it does no work and returns None.

 Flow:
 1. Sleep for 0.5 seconds to allow the audio buffer to accumulate
 2. Check for voice activity
 3. If voice activity is detected and mode is "idle":
 - Set mode to "listening"
 4. If voice activity is detected and mode is "speaking":
 - Stop the llm and tts tasks
 - Set mode to "listening"
 5. If voice activity is detected and mode is "listening":
 - If there's previous_no_vad_audio, add it to chunk_queue
 - Start accumulating audio chunks in chunk_queue
 - If the length of chunk_queue is greater than 3 seconds
 - Get the first 2 seconds of audio from chunk_queue
 - Run transcription on the first 2 seconds
 - Store the transcription in the state
 - Remove the first 2 seconds of audio from chunk_queue
 6. If voice activity is not detected:
 - If mode is "listening" and there's audio in chunk_queue
 - Add the chunk to chunk_queue
 - Set mode to "processing"
 - Run transcription on the leftover audio in chunk_queue
 - Store the transcription in the state
 - Set the mode to "processing"
 - If mode is "processing"
 - Check if there's any leftover audio in chunk_queue
 - If there is, run transcription on the leftover audio
 - Store the transcription in the state
 - Start LLM and TTS in the background
 - Set mode to "responding"
 - If mode is "responding"
 - Get the audio byte chunks from TTS
 - Output the full audio
 - Set mode to "idle"
 - If mode is "idle"
 - do nothing
 
 Ex: Gradio Streaming Audio Example:
 import gradio as gr
 import numpy as np
 import time

 def add_to_stream(audio, instream):
 time.sleep(1)
 if audio is None:
 return gr.update(), instream
 if instream is None:
 ret = audio
 else:
 ret = (audio[0], np.concatenate((instream[1], audio[1])))
 return ret, ret


 with gr.Blocks() as demo:
 inp = gr.Audio(source="microphone")
 out = gr.Audio()
 stream = gr.State()
 clear = gr.Button("Clear")

 inp.stream(add_to_stream, [inp, stream], [out, stream])
 clear.click(lambda: [None, None, None], None, [inp, out, stream])


 if __name__ == "__main__":
 demo.launch()
 """
 # Previous implementation, preserved as a bare string literal: it is
 # evaluated and discarded, so none of the code inside it ever runs.
 """old code:
 time.sleep(0.5)
 if audio is None:
 return None, state

 sample_rate, audio_data = audio
 audio_data = np.array(audio_data, dtype=np.float32)

 # Convert to mono if stereo
 if audio_data.ndim > 1:
 audio_data = np.mean(audio_data, axis=1)

 # Check for voice activity
 vad_result = check_vad(audio_data, sample_rate)
 if vad_result:
 logging.info('Voice activity detected')
 # Voice activity detected
 if state.get("previous_audio_chunk") is not None:
 state["audio_buffer"].append(state["previous_audio_chunk"])
 state["audio_buffer"].append(audio_data)
 state["is_speaking"] = True
 state["previous_audio_chunk"] = audio_data

 # Update total speaking time
 chunk_duration = len(audio_data) / sample_rate
 state["total_speaking_time"] += chunk_duration

 # Start transcription after 3 seconds
 if state["total_speaking_time"] >= 3.0 and not state["transcription_started"]:
 logging.info('Starting transcription')
 # Start transcribing the first 2 seconds
 accumulated_audio = np.concatenate(state["audio_buffer"])
 first_two_seconds_samples = int(2.0 * sample_rate)
 first_two_seconds_audio = accumulated_audio[:first_two_seconds_samples]

 # Transcribe asynchronously
 transcribed_text = transcript(first_two_seconds_audio, sample_rate)
 state["transcription"] += transcribed_text
 state["transcription_started"] = True

 # Start LLM and TTS in the background
 state["llm_task"] = llm_and_tts(state["transcription"], state)
 else:
 if state["is_speaking"]:
 logging.info('Voice activity ended')
 # Voice activity just ended
 # Process the accumulated audio
 full_audio = np.concatenate(state["audio_buffer"])
 # Reset the state
 state["audio_buffer"] = []
 state["is_speaking"] = False
 state["total_speaking_time"] = 0.0
 state["transcription_started"] = False

 # Transcribe the remaining audio
 transcribed_text = transcript(full_audio, sample_rate)
 state["transcription"] += transcribed_text

 # Start LLM and TTS if not already started
 if not state.get("llm_task"):
 state["llm_task"] = llm_and_tts(state["transcription"], state)

 # Check if there's audio to output
 if state.get("tts_audio_chunks"):
 logging.info('Outputting audio')
 # Collect audio chunks
 audio_chunks = state["tts_audio_chunks"]
 state["tts_audio_chunks"] = []
 response_audio = b"".join(audio_chunks)
 np_response_audio = np.frombuffer(response_audio, dtype=np.int16)
 return (sample_rate, np_response_audio), state

 # Collect the last chunk if it exists
 if state.get("previous_audio_chunk") is not None:
 state["audio_buffer"].append(state["previous_audio_chunk"])

 return None, state
 """
 # Placeholder statement; the function falls through and returns None.
 ...


In [None]:
# Function to process audio input
def _new_conversation_state():
    """Return a fresh state dict for one streaming session."""
    return {
        'mode': 'idle',
        'chunk_queue': [],
        'transcription': '',
        'previous_no_vad_audio': None,
        'tts_audio_chunks': [],
        'llm_task': None,
        'instream': None,
    }


def _to_mono_float32(audio_data):
    """Convert a microphone chunk to mono float32, scaling integer PCM into [-1, 1)."""
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.signedinteger):
        # Integer PCM must be normalised; a plain cast would keep values in
        # the thousands, far outside the float range audio models expect.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    # Convert to mono if stereo
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)
    return samples


def process_audio_chunk(audio, state):
    """Advance the voice-assistant state machine by one streamed audio chunk.

    Modes stored in ``state['mode']``:
      * ``idle``       - waiting for speech.
      * ``listening``  - speech is being accumulated in ``chunk_queue``; once
        more than 3 s is queued, the first 2 s are transcribed and removed.
      * ``processing`` - the utterance ended; the background task is running.
      * ``responding`` - the background task finished; queued TTS audio is
        returned and the machine goes back to ``idle``.
      * ``speaking``   - if the mode is ever ``speaking`` when speech arrives,
        the background task is asked to stop and listening resumes.

    Args:
        audio: ``(sample_rate, samples)`` tuple from the streaming audio
            input, or ``None`` when no chunk is available.
        state: Session state dict (see ``_new_conversation_state``) or
            ``None`` to start a new session.

    Returns:
        ``(response, state)`` where ``response`` is ``None`` or
        ``(sample_rate, accumulated_response_samples)``.

    Notes:
        NOTE(review): ``llm_and_tts`` is not defined in the cells visible
        here; presumably it appends audio bytes to
        ``state['tts_audio_chunks']`` and honours ``state['stop_signal']`` -
        confirm against its definition.
        NOTE(review): response bytes are interpreted as 16-bit PCM at the
        microphone sample rate; verify the format ``llm_and_tts`` produces.
    """
    if audio is None:
        return None, state
    if state is None:
        state = _new_conversation_state()

    sample_rate, audio_data = audio
    audio_data = _to_mono_float32(audio_data)

    mode = state['mode']
    chunk_queue = state['chunk_queue']
    transcription = state['transcription']
    previous_no_vad_audio = state['previous_no_vad_audio']
    tts_audio_chunks = state['tts_audio_chunks']
    llm_task = state['llm_task']
    instream = state['instream']

    response = None

    # Check for voice activity
    vad_result = check_vad(audio_data, sample_rate)

    if vad_result:
        logging.info(f'Voice activity detected in mode: {mode}')
        if mode == 'idle':
            mode = 'listening'
        elif mode == 'speaking':
            # Stop llm and tts tasks
            if llm_task and llm_task.is_alive():
                logging.info('Stopping LLM and TTS tasks')
                # Threads cannot be killed directly; the task is expected to
                # poll this flag and exit on its own.
                state['stop_signal'] = True
                llm_task.join()
            mode = 'listening'

        if mode == 'listening':
            if previous_no_vad_audio is not None:
                # Prepend the last silent chunk so the onset of speech is kept.
                chunk_queue.append(previous_no_vad_audio)
                previous_no_vad_audio = None
            # Accumulate audio chunks
            chunk_queue.append(audio_data)
            # Calculate the length of chunk_queue in seconds
            total_samples = sum(len(chunk) for chunk in chunk_queue)
            total_duration = total_samples / sample_rate
            if total_duration > 3.0:
                # Transcribe the first 2 seconds and keep the rest queued
                first_two_seconds_samples = int(2.0 * sample_rate)
                accumulated_audio = np.concatenate(chunk_queue)
                first_two_seconds_audio = accumulated_audio[:first_two_seconds_samples]
                transcription += transcript(first_two_seconds_audio, sample_rate)
                remaining_audio = accumulated_audio[first_two_seconds_samples:]
                chunk_queue = [remaining_audio] if len(remaining_audio) > 0 else []
    else:
        logging.info(f'No voice activity detected in mode: {mode}')
        if mode == 'listening' and chunk_queue:
            # The utterance just ended: transcribe everything still queued.
            chunk_queue.append(audio_data)
            accumulated_audio = np.concatenate(chunk_queue)
            transcription += transcript(accumulated_audio, sample_rate)
            chunk_queue = []
            mode = 'processing'
            # Start LLM and TTS in the background
            if not llm_task or not llm_task.is_alive():
                state['stop_signal'] = False
                llm_task = threading.Thread(target=llm_and_tts, args=(transcription, state))
                llm_task.start()
        elif mode == 'processing':
            # Wait for LLM and TTS to finish
            if llm_task and not llm_task.is_alive():
                mode = 'responding'
        elif mode == 'responding':
            if tts_audio_chunks:
                logging.info('Outputting audio response')
                response_audio = b"".join(tts_audio_chunks)
                np_response_audio = np.frombuffer(response_audio, dtype=np.int16)
                if instream is None:
                    instream = np_response_audio
                else:
                    instream = np.concatenate((instream, np_response_audio))
                response = (sample_rate, instream)
            else:
                logging.warning('No TTS audio was produced; returning to idle')
            # Always leave 'responding', even when no audio was produced;
            # otherwise the state machine would stay stuck in this mode.
            tts_audio_chunks.clear()
            transcription = ''
            llm_task = None
            mode = 'idle'
        elif mode == 'idle':
            # Do nothing
            pass
        else:
            # Store the audio when no VAD is detected
            previous_no_vad_audio = audio_data

    # Update state
    state.update({
        'mode': mode,
        'chunk_queue': chunk_queue,
        'transcription': transcription,
        'previous_no_vad_audio': previous_no_vad_audio,
        'tts_audio_chunks': tts_audio_chunks,
        'llm_task': llm_task,
        'instream': instream
    })

    return response, state

# Initialize the state
initial_state = {
    'mode': 'idle',
    'chunk_queue': [],
    'transcription': '',
    'previous_no_vad_audio': None,
    'tts_audio_chunks': [],
    'llm_task': None,
    'instream': None,
}

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Voice-Activated Transcription and Response System")
    audio_input = gr.Audio(sources="microphone", type="numpy", streaming=True)
    state = gr.State(initial_state)
    audio_output = gr.Audio(label="Response Audio", autoplay=True)
    # Stream microphone chunks through `process_audio_chunk`, the handler
    # defined in this cell; it receives and returns the session state.
    audio_input.stream(process_audio_chunk, [audio_input, state], [audio_output, state])

if __name__ == "__main__":
    logging.info('Launching Gradio interface')
    demo.launch()
