from transformers import pipeline
import gradio as gr
from pytube import YouTube
import os
import requests
import time
from openai import OpenAI

client = OpenAI()

pipe = pipeline(model="dussen/whisper-small-nl-hc")
print(pipe)


def download_audio(url, output_path='downloads'):
    try:
        # Create a YouTube object
        yt = YouTube(url)

        # Get the first audio-only stream (mp4/m4a container)
        audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
        audio_stream.download(output_path)

        # If a video.mp3 file is left over from a previous run, delete it
        if os.path.exists(f"{output_path}/video.mp3"):
            os.remove(f"{output_path}/video.mp3")

        # Rename the downloaded file to video.mp3
        default_filename = audio_stream.default_filename
        mp4_path = f"{output_path}/{default_filename}"
        mp3_path = f"{output_path}/video.mp3"
        os.rename(mp4_path, mp3_path)

        # Use the model to transcribe the audio
        text = pipe(mp3_path)["text"]

        # Delete the audio file
        os.remove(mp3_path)

        return text
    except Exception as e:
        print(f"Error: {e}")
        return f"Error: {e}"


def audio_to_text(audio):
    text = pipe(audio)["text"]
    print(text)
    return text


def radio_to_text(radio_url):
    r = requests.get(radio_url, stream=True)

    # Record roughly 10 seconds of the stream to a local file
    with open('stream.mp3', 'wb') as f:
        # Get the stopping time as a UNIX timestamp
        stop_after = time.time() + 10
        try:
            for block in r.iter_content(1024):
                f.write(block)
                if time.time() > stop_after:
                    break
        except KeyboardInterrupt:
            pass

    text = pipe("stream.mp3")["text"]
    print(text)

    # Use ChatGPT to classify the fragment (news, music, ad or other) and summarise it
    prompt = (
        "Dit stuk komt uit een radio-uitzending en is getranscribeerd door AI. "
        "Er kunnen fouten in zitten. Kan je eerst de categorie geven uit `nieuws`, "
        "`muziek`, `advertentie` of `rest`, en dan in maximaal drie zinnen "
        f"samenvatten wat er gezegd is? {text}"
    )

    # Roughly limit the prompt length (characters, not tokens) to stay within the context window
    prompt = prompt[:3584]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=512,
        top_p=1,
    )

    text = (
        f"Tekst van de AI die is getranscribeerd: {text}\n\n---\n\n"
        f"Samenvatting door AI:\n\n{response.choices[0].message.content}"
    )
    return text


iface_video_url = gr.Interface(
    fn=download_audio,
    inputs="text",
    outputs="text",
    title="Whisper Small Dutch - Use a YouTube URL",
    description="Demo for Dutch speech recognition using a fine-tuned Whisper small model.",
)

iface_audio = gr.Interface(
    fn=audio_to_text,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Whisper Small Dutch - Use your microphone",
    description="Realtime demo for Dutch speech recognition using a fine-tuned Whisper small model.",
)

iface_radio = gr.Interface(
    fn=radio_to_text,
    inputs="text",
    outputs="text",
    title="Whisper Small Dutch - Use a radio URL",
    description=(
        "Demo for Dutch speech recognition using a fine-tuned Whisper small model. "
        "It records a short fragment of what is playing on the given radio stream URL, "
        "transcribes it and then summarises it using ChatGPT."
    ),
)

app = gr.TabbedInterface(
    [iface_audio, iface_video_url, iface_radio],
    ["Audio to text", "Video to text", "Radio to text"],
)

if __name__ == "__main__":
    app.launch()