from transformers import pipeline
import gradio as gr
from pytube import YouTube
import os
import requests
import time
from openai import OpenAI

client = OpenAI()

pipe = pipeline(model="dussen/whisper-small-nl-hc")
print(pipe)


def download_audio(url, output_path='downloads'):
    try:
        # Create a YouTube object
        yt = YouTube(url)

        # Get the first audio-only stream (mp4/m4a container)
        audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
        audio_stream.download(output_path)

        # If a video.mp3 file is left over from a previous run, delete it
        if os.path.exists(f"{output_path}/video.mp3"):
            os.remove(f"{output_path}/video.mp3")

        # Rename the downloaded file to video.mp3
        default_filename = audio_stream.default_filename
        mp4_path = f"{output_path}/{default_filename}"
        mp3_path = f"{output_path}/video.mp3"
        os.rename(mp4_path, mp3_path)

        # Use the model to transcribe the audio
        text = pipe(mp3_path)["text"]

        # Delete the audio file
        os.remove(mp3_path)

        return text
    except Exception as e:
        print(f"Error: {e}")
        return f"Error: {e}"


def audio_to_text(audio):
    text = pipe(audio)["text"]
    print(text)
    return text


def radio_to_text(radio_url):
    r = requests.get(radio_url, stream=True)

    # Record roughly 10 seconds of the stream to a local file
    with open('stream.mp3', 'wb') as f:
        # Get the stopping time as a UNIX timestamp
        stop_after = time.time() + 10
        try:
            for block in r.iter_content(1024):
                f.write(block)
                if time.time() > stop_after:
                    break
        except KeyboardInterrupt:
            pass

    text = pipe("stream.mp3")["text"]
    print(text)

    # Use ChatGPT to classify the fragment (news, music, ad or other) and summarise it
    prompt = (
        "Dit stuk komt uit een radio-uitzending en is getranscribeerd door AI. "
        "Er kunnen fouten in zitten. Kan je eerst de categorie geven uit `nieuws`, "
        "`muziek`, `advertentie` of `rest`, en dan in maximaal drie zinnen "
        f"samenvatten wat er gezegd is? {text}"
    )

    # Roughly limit the prompt length (characters, not tokens) to stay within the context window
    prompt = prompt[:3584]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=512,
        top_p=1,
    )

    text = (
        f"Tekst van de AI die is getranscribeerd: {text}\n\n---\n\n"
        f"Samenvatting door AI:\n\n{response.choices[0].message.content}"
    )
    return text


iface_video_url = gr.Interface(
    fn=download_audio,
    inputs="text",
    outputs="text",
    title="Whisper Small Dutch - Use a YouTube URL",
    description="Demo for Dutch speech recognition using a fine-tuned Whisper small model.",
)

iface_audio = gr.Interface(
    fn=audio_to_text,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Whisper Small Dutch - Use your microphone",
    description="Realtime demo for Dutch speech recognition using a fine-tuned Whisper small model.",
)

iface_radio = gr.Interface(
    fn=radio_to_text,
    inputs="text",
    outputs="text",
    title="Whisper Small Dutch - Use a radio URL",
    description=(
        "Demo for Dutch speech recognition using a fine-tuned Whisper small model. "
        "It records a short fragment of what is playing on the given radio stream URL, "
        "transcribes it and then summarises it using ChatGPT."
    ),
)

app = gr.TabbedInterface(
    [iface_audio, iface_video_url, iface_radio],
    ["Audio to text", "Video to text", "Radio to text"],
)

if __name__ == "__main__":
    app.launch()