# Whisper_dutch / app.py
from transformers import pipeline
import gradio as gr
from pytube import YouTube
import os
import requests
import time
from openai import OpenAI
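
# The OpenAI client reads its API key from the OPENAI_API_KEY environment variable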
client = OpenAI()
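
# Dutch speech-recognition pipeline: a Whisper-small checkpoint fine-tuned for Dutch.
# It accepts an audio file path and returns a dict with the transcription under "text".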
pipe = pipeline(model="dussen/whisper-small-nl-hc")
print(pipe)

def download_audio(url, output_path='downloads'):
    try:
        # Create a YouTube object
        yt = YouTube(url)

        # Get an audio-only MP4 stream and download it
        audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
        audio_stream.download(output_path)

        # If a leftover video.mp4 file already exists, delete it
        if os.path.exists(f"{output_path}/video.mp4"):
            os.remove(f"{output_path}/video.mp4")

        # Rename the downloaded file to video.mp3
        default_filename = audio_stream.default_filename
        mp4_path = f"{output_path}/{default_filename}"
        mp3_path = f"{output_path}/video.mp3"
        os.rename(mp4_path, mp3_path)

        # Use the model to transcribe the audio
        text = pipe(mp3_path)["text"]

        # Delete the audio file
        os.remove(mp3_path)
        return text
    except Exception as e:
        print(f"Error: {e}")
        return f"Error: {e}"

def audio_to_text(audio):
    text = pipe(audio)["text"]
    print(text)
    return text

def radio_to_text(radio_url):
    r = requests.get(radio_url, stream=True)

    # Record roughly 10 seconds of the stream, then close the connection
    with open('stream.mp3', 'wb') as f:
        # Get the stopping time as a UNIX timestamp
        stop_after = time.time() + 10
        try:
            for block in r.iter_content(1024):
                f.write(block)
                if time.time() > stop_after:
                    break
        except KeyboardInterrupt:
            pass

    # Transcribe the recorded snippet
    text = pipe("stream.mp3")["text"]
    print(text)

    # Use ChatGPT to classify the fragment (news, music, ad or other) and summarise it
    prompt = (
        "Dit stuk komt uit een radio-uitzending en is getranscribeerd door AI. "
        "Er kunnen fouten in zitten. Kan je eerst de categorie van de tekst geven "
        "(`nieuws`, `muziek`, `advertentie` of `rest`), en dan in maximaal drie zinnen "
        f"samenvatten wat er gezegd is? {text}"
    )

    # Truncate the prompt (by characters, not tokens) to stay within the model's context window
    prompt = prompt[:3584]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=512,
        top_p=1
    )

    text = (
        f"Door AI getranscribeerde tekst: {text}\n\n---\n\n"
        f"Samenvatting door AI:\n\n{response.choices[0].message.content}"
    )
    return text
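
# Gradio interfaces: one tab per input source (microphone, YouTube URL, radio stream URL)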

iface_video_url = gr.Interface(
    fn=download_audio,
    inputs="text",
    outputs="text",
    title="Whisper Small Dutch - Use a YouTube URL",
    description="Demo for Dutch speech recognition using a fine-tuned Whisper small model.",
)

iface_audio = gr.Interface(
    fn=audio_to_text,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Whisper Small Dutch - Use your microphone",
    description="Realtime demo for Dutch speech recognition using a fine-tuned Whisper small model.",
)

iface_radio = gr.Interface(
    fn=radio_to_text,
    inputs="text",
    outputs="text",
    title="Whisper Small Dutch - Use a radio URL",
    description="Demo for Dutch speech recognition using a fine-tuned Whisper small model. It records a short snippet from the given radio stream URL, transcribes it and then summarises it using ChatGPT.",
)

app = gr.TabbedInterface(
    [iface_audio, iface_video_url, iface_radio],
    ["Audio to text", "Video to text", "Radio to text"],
)

if __name__ == "__main__":
    app.launch()
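
# launch() starts a local web server for the app; share=True can be passed to also get a temporary public link.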