Spaces:
Sleeping
Sleeping
File size: 3,601 Bytes
e96b0df 3aff486 befa5d2 2a6ff40 f47d361 e96b0df 2a6ff40 b932632 3aff486 c1e7ebb 3aff486 c1e7ebb 3aff486 2a6ff40 c1e7ebb 3aff486 e96b0df b932632 e96b0df 167c051 25cb334 167c051 befa5d2 167c051 befa5d2 5238f45 befa5d2 167c051 befa5d2 167c051 2a6ff40 167c051 3aff486 167c051 3aff486 932a9e1 3aff486 167c051 e96b0df 932a9e1 e96b0df 932a9e1 167c051 932a9e1 167c051 2a6ff40 932a9e1 167c051 3aff486 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
from transformers import pipeline
import gradio as gr
from pytube import YouTube
import os
import requests
import time
from openai import OpenAI
# OpenAI client for the ChatGPT summarisation step in radio_to_text
# (presumably picks up the API key from the environment — verify at deploy time).
client = OpenAI()
# Fine-tuned Dutch Whisper model; shared by all three transcription functions below.
pipe = pipeline(model="dussen/whisper-small-nl-hc")
print(pipe)
def download_audio(url, output_path='downloads'):
    """Download a YouTube video's audio track and transcribe it.

    Args:
        url: YouTube video URL.
        output_path: Directory the audio file is downloaded into.

    Returns:
        The transcribed text, or an error message string if anything fails
        (returning the message instead of None makes the failure visible
        in the Gradio textbox).
    """
    try:
        yt = YouTube(url)
        # Highest-quality audio-only stream in an mp4 container.
        audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
        audio_stream.download(output_path)
        downloaded_path = os.path.join(output_path, audio_stream.default_filename)
        # NOTE: this is a rename only, not a format conversion — the container
        # stays mp4; Whisper/ffmpeg handles it regardless of the extension.
        audio_path = os.path.join(output_path, "video.mp3")
        # BUG FIX: the old code removed a stale "video.mp4", but the rename
        # target is "video.mp3" — clear THAT file so os.rename cannot fail
        # (os.rename raises on Windows when the destination exists).
        if os.path.exists(audio_path):
            os.remove(audio_path)
        os.rename(downloaded_path, audio_path)
        # Transcribe with the module-level Whisper pipeline.
        text = pipe(audio_path)["text"]
        # Clean up the temporary audio file.
        os.remove(audio_path)
        return text
    except Exception as e:
        print(f"Error: {e}")
        # Surface the error to the UI instead of silently returning None.
        return f"Error: {e}"
def audio_to_text(audio):
    """Transcribe a recorded audio file with the Dutch Whisper model.

    Args:
        audio: Path to the audio file (Gradio microphone component
            supplies a filepath).

    Returns:
        The recognised text.
    """
    transcription = pipe(audio)
    recognised = transcription["text"]
    print(recognised)
    return recognised
def radio_to_text(radio_url):
    """Record ~10 seconds of a live radio stream, transcribe it, and summarise it.

    Args:
        radio_url: URL of a live audio stream (e.g. an internet radio station).

    Returns:
        A string containing both the raw transcription and the ChatGPT summary.
    """
    stream_path = "stream.mp3"
    # Stream the radio feed to disk; the context manager closes the HTTP
    # connection (the old code leaked the streaming connection).
    with requests.get(radio_url, stream=True) as r, open(stream_path, "wb") as f:
        # Stop recording roughly 10 seconds from now.
        stop_after = time.time() + 10
        try:
            for block in r.iter_content(1024):
                f.write(block)
                if time.time() > stop_after:
                    break
        except KeyboardInterrupt:
            pass
    text = pipe(stream_path)["text"]
    print(text)
    # Ask ChatGPT to classify the clip (news / music / ad / other) and summarise it.
    prompt = f"Dit stuk komt uit een radio uitzending en is getranscribeerd door AI. Er kunnen fouten in zitten. Kan je eerst het categorie text geven uit `nieuws`, `muziek`, `advertentie` of rest`, en dan in max drie zinnen wat er gezegd is?{text}"
    # Truncate the prompt (character count, a rough proxy for the token limit)
    # so the request stays within the model's context window.
    prompt = prompt[:3584]
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=512,
        top_p=1
    )
    # BUG FIX: extract the assistant's message content — the old code
    # interpolated the whole ChatCompletion object, showing its repr
    # instead of the summary.
    summary = response.choices[0].message.content
    text = f"Tekst van de AI die is getranscribeerd: {text}\n\n---\n\nSamenvatting door AI:\n\n{summary}"
    return text
# Tab 1: paste a YouTube URL; the audio is downloaded and transcribed.
iface_video_url = gr.Interface(
    fn=download_audio,
    inputs="text",
    outputs="text",
    title="Whisper Small Dutch - Use a YouTube URL",
    description="Demo for dutch speech recognition using a fine-tuned Whisper small model.",
)
# Tab 2: record from the microphone; Gradio passes a filepath to audio_to_text.
iface_audio = gr.Interface(
    fn=audio_to_text,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Whisper Small Dutch - Use your microphone",
    description="Realtime demo for dutch speech recognition using a fine-tuned Whisper small model.",
)
# Tab 3: paste a radio stream URL; a ~10s clip is transcribed and summarised.
iface_radio = gr.Interface(
    fn=radio_to_text,
    inputs="text",
    outputs="text",
    title="Whisper Small Dutch - Use a radio URL",
    description="Demo for dutch speech recognition using a fine-tuned Whisper small model. It gets information on what is playing on the given radio URL. It transcribes it and then summarises it using chatGPT.",
)
# Combine the three demos into a single tabbed Gradio app.
demo_interfaces = [iface_audio, iface_video_url, iface_radio]
tab_titles = ["Audio to text", "Video to text", "Radio to text"]
app = gr.TabbedInterface(demo_interfaces, tab_titles)

if __name__ == "__main__":
    app.launch()
|