# Whisper_dutch / app.py
# SevenhuijsenM
# Implementation of AI
# f47d361
# raw
# history blame
# 3.6 kB
from transformers import pipeline
import gradio as gr
from pytube import YouTube
import os
import requests
import time
from openai import OpenAI
# OpenAI API client used by radio_to_text to summarise transcriptions.
# It is constructed without arguments, so credentials are not set here —
# presumably they come from the environment (OPENAI_API_KEY); confirm in deployment.
client = OpenAI()
# Speech-recognition pipeline loaded once at import time from the
# "dussen/whisper-small-nl-hc" model and shared by the handlers in this file.
pipe = pipeline(model="dussen/whisper-small-nl-hc")
# Log the loaded pipeline object at startup.
print(pipe)
def download_audio(url, output_path='downloads'):
    """Download the audio track of a YouTube video and transcribe it.

    The audio is downloaded into ``output_path``, moved to
    ``<output_path>/video.mp3``, transcribed with the module-level ``pipe``
    and then deleted again, whether or not transcription succeeded.

    Args:
        url: URL of the YouTube video.
        output_path: Directory used for the temporary audio file.

    Returns:
        The transcribed text, or ``None`` if anything went wrong (the error
        is printed, not raised, so the UI handler never crashes).
    """
    mp3_path = os.path.join(output_path, "video.mp3")
    # Files created by this call that still have to be removed on exit.
    created_files = []
    try:
        yt = YouTube(url)
        # Take the first audio-only stream offered in an mp4 container.
        audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
        if audio_stream is None:
            raise ValueError("no audio-only mp4 stream available for this video")

        # download() returns the path it actually wrote to, so there is no
        # need to rebuild it from default_filename.
        downloaded_path = audio_stream.download(output_path)
        created_files.append(downloaded_path)

        # os.replace overwrites a stale video.mp3 left by an earlier failed
        # run on every platform (os.rename would fail on Windows).
        os.replace(downloaded_path, mp3_path)
        created_files = [mp3_path]

        # Use the model to transcribe the audio.
        return pipe(mp3_path)["text"]
    except Exception as e:
        # Best-effort handler: report the problem and return nothing.
        print(f"Error: {e}")
        return None
    finally:
        # Always clean up, including when the rename or transcription failed.
        for leftover in created_files:
            if os.path.exists(leftover):
                os.remove(leftover)
def audio_to_text(audio):
    """Transcribe a recorded audio file with the module-level ``pipe``.

    Args:
        audio: Path to the recorded audio file as supplied by the Gradio
            audio input, or ``None`` when the user submitted without
            recording anything.

    Returns:
        The transcribed text, or an empty string when no audio was supplied.
    """
    # Gradio hands over None for an empty audio component; there is nothing
    # to decode, so skip the pipeline instead of letting it raise.
    if not audio:
        return ""
    text = pipe(audio)["text"]
    print(text)
    return text
def radio_to_text(radio_url):
    """Record a short clip from a radio stream, transcribe and summarise it.

    About ten seconds of the stream at ``radio_url`` are written to a
    temporary file, transcribed with the module-level ``pipe``, and the
    transcription is sent to the OpenAI chat API with a Dutch prompt asking
    for a category and a short summary.

    Args:
        radio_url: URL of the radio audio stream.

    Returns:
        A string containing the transcription followed by the model's summary.

    Raises:
        requests.RequestException: If the stream cannot be opened, returns an
            HTTP error status, or stalls longer than the network timeout.
    """
    import tempfile

    record_seconds = 10
    network_timeout = 10  # seconds, applied to connect and to each read

    # A unique temporary file keeps concurrent requests from overwriting each
    # other's recording. delete=False lets the file be closed first and then
    # reopened by path by the pipeline.
    clip = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    try:
        # The response is used as a context manager so the streaming
        # connection is released once recording stops.
        with clip, requests.get(radio_url, stream=True, timeout=network_timeout) as r:
            # Fail early rather than transcribing an HTML error page.
            r.raise_for_status()
            # A monotonic clock is immune to system clock adjustments.
            stop_after = time.monotonic() + record_seconds
            try:
                for block in r.iter_content(1024):
                    clip.write(block)
                    if time.monotonic() > stop_after:
                        break
            except KeyboardInterrupt:
                # Keep whatever was recorded so far.
                pass
        text = pipe(clip.name)["text"]
    finally:
        os.remove(clip.name)
    print(text)

    # Ask the chat model to categorise the clip (news, music, ad, other) and summarise it.
    prompt = f"Dit stuk komt uit een radio uitzending en is getranscribeerd door AI. Er kunnen fouten in zitten. Kan je eerst het categorie text geven uit `nieuws`, `muziek`, `advertentie` of rest`, en dan in max drie zinnen wat er gezegd is?{text}"
    # Cap the prompt length. Note this is a limit in characters, not tokens.
    prompt = prompt[:3584]
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=512,
        top_p=1
    )
    # Use only the generated message text; formatting the response object
    # itself would dump its whole repr into the UI.
    summary = response.choices[0].message.content or ""
    text = f"Tekst van de AI die is getranscribeerd: {text}\n\n---\n\nSamenvatting door AI:\n\n{summary}"
    return text
# Tab for transcribing a YouTube video: a single text input (the video URL)
# is passed to download_audio and its return value is shown as text.
iface_video_url = gr.Interface(
fn=download_audio,
inputs="text",
outputs="text",
title="Whisper Small Dutch - Use a YouTube URL",
description="Demo for dutch speech recognition using a fine-tuned Whisper small model.",
)
# Tab for transcribing a microphone recording: the audio component is
# configured with type="filepath" and its value is passed to audio_to_text,
# whose return value is shown as text.
iface_audio = gr.Interface(
fn=audio_to_text,
inputs=gr.Audio(sources=["microphone"], type="filepath"),
outputs="text",
title="Whisper Small Dutch - Use your microphone",
description="Realtime demo for dutch speech recognition using a fine-tuned Whisper small model.",
)
# Tab for a radio stream: a single text input (the stream URL) is passed to
# radio_to_text, which returns the transcription plus a summary as text.
iface_radio = gr.Interface(
fn=radio_to_text,
inputs="text",
outputs="text",
title="Whisper Small Dutch - Use a radio URL",
description="Demo for dutch speech recognition using a fine-tuned Whisper small model. It gets information on what is playing on the given radio URL. It transcribes it and then summarises it using chatGPT.",
)
# Combine the three interfaces into one tabbed application; each tab label
# is paired with the interface at the same position in the first list.
app = gr.TabbedInterface(
    [iface_audio, iface_video_url, iface_radio],
    ["Audio to text", "Video to text", "Radio to text"],
)

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()