File size: 3,601 Bytes
e96b0df
 
3aff486
 
befa5d2
 
2a6ff40
f47d361
e96b0df
 
2a6ff40
b932632
3aff486
 
 
 
 
 
 
 
c1e7ebb
 
 
 
3aff486
c1e7ebb
 
 
 
3aff486
 
 
 
2a6ff40
c1e7ebb
 
3aff486
 
 
 
 
 
e96b0df
b932632
e96b0df
 
167c051
25cb334
167c051
befa5d2
167c051
befa5d2
5238f45
befa5d2
167c051
 
 
befa5d2
 
167c051
 
 
 
2a6ff40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167c051
 
3aff486
167c051
3aff486
 
932a9e1
3aff486
 
 
 
167c051
e96b0df
 
932a9e1
e96b0df
 
 
932a9e1
167c051
 
932a9e1
167c051
2a6ff40
932a9e1
 
167c051
3aff486
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import tempfile
import time

import gradio as gr
import requests
from openai import OpenAI
from pytube import YouTube
from transformers import pipeline
# OpenAI client used by radio_to_text for the chat-completion summary request.
client = OpenAI()

# Transformers pipeline for the "dussen/whisper-small-nl-hc" model. It is created
# once at import time and shared by every handler below, which all read the
# "text" field of its result.
pipe = pipeline(model="dussen/whisper-small-nl-hc")

# Print the pipeline object at startup.
print(pipe)
def download_audio(url, output_path='downloads'):
    """Download the audio track of a YouTube video and transcribe it.

    Args:
        url: URL of the YouTube video.
        output_path: Directory in which the audio is stored temporarily.

    Returns:
        The transcribed text, or ``None`` when the download or the
        transcription failed (the error is printed).
    """
    # Fixed working name handed to the model; the file keeps its MP4 contents,
    # only the name changes.
    mp3_path = os.path.join(output_path, "video.mp3")
    downloaded_path = None

    try:
        # Create a YouTube object
        yt = YouTube(url)

        # Take the first audio-only MP4 stream that is offered
        audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
        if audio_stream is None:
            raise ValueError("no audio-only mp4 stream available for this video")

        # download() returns the path it actually wrote to, so there is no
        # need to rebuild it from the default filename.
        downloaded_path = audio_stream.download(output_path)

        # os.replace overwrites a stale video.mp3 left by an earlier run on
        # every platform (os.rename fails on Windows if the target exists).
        os.replace(downloaded_path, mp3_path)

        # Use the model to transcribe the audio
        return pipe(mp3_path)["text"]
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        # Always clean up, also when the rename or the transcription failed.
        for leftover in (downloaded_path, mp3_path):
            if leftover and os.path.exists(leftover):
                os.remove(leftover)

def audio_to_text(audio):
    """Transcribe a recorded audio file.

    Args:
        audio: Path of the audio file to transcribe, or ``None`` when nothing
            was recorded.

    Returns:
        The transcribed text; an empty string when no audio was supplied.
    """
    # Nothing was recorded: there is nothing to hand to the model.
    if audio is None:
        return ""

    text = pipe(audio)["text"]
    print(text)
    return text

def radio_to_text(radio_url):
    """Record a short sample of a radio stream, transcribe and summarise it.

    Roughly ten seconds of the stream are written to a temporary file, which is
    transcribed with the shared pipeline. The transcript is then sent to the
    chat-completion API with a Dutch prompt asking for a category and a short
    summary.

    Args:
        radio_url: URL of the audio stream.

    Returns:
        A string containing the transcript followed by the generated summary.

    Raises:
        requests.RequestException: If the stream cannot be fetched.
    """
    capture_seconds = 10

    # A unique temporary file per call, so parallel requests do not overwrite
    # each other's recording. delete=False lets the model reopen it by path;
    # it is removed explicitly below.
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    stream_path = tmp.name
    try:
        # Both the file and the HTTP connection are closed when the block ends.
        with tmp, requests.get(radio_url, stream=True, timeout=10) as r:
            r.raise_for_status()

            # Monotonic clock: unaffected by system clock adjustments.
            stop_after = time.monotonic() + capture_seconds

            try:
                for block in r.iter_content(1024):
                    tmp.write(block)
                    if time.monotonic() > stop_after:
                        break
            except KeyboardInterrupt:
                pass

        text = pipe(stream_path)["text"]
    finally:
        if os.path.exists(stream_path):
            os.remove(stream_path)
    print(text)

    # Use chatGPT to summarise the text using a prompt that says whether it is news, an ad or a song
    prompt = f"Dit stuk komt uit een radio uitzending en is getranscribeerd door AI. Er kunnen fouten in zitten. Kan je eerst het categorie text geven uit `nieuws`, `muziek`, `advertentie` of rest`, en dan in max drie zinnen wat er gezegd is?{text}"

    # Cap the prompt at 3584 characters (a character limit, not a token count)
    prompt = prompt[:3584]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=512,
        top_p=1
        )

    # Show only the generated message, not the repr of the whole response object.
    summary = response.choices[0].message.content
    text = f"Tekst van de AI die is getranscribeerd: {text}\n\n---\n\nSamenvatting door AI:\n\n{summary}"

    return text

# Tab: text input (a YouTube URL) -> transcript produced by download_audio.
iface_video_url = gr.Interface(
    fn=download_audio,
    inputs="text",
    outputs="text",
    title="Whisper Small Dutch - Use a YouTube URL",
    description="Demo for dutch speech recognition using a fine-tuned Whisper small model.",
)

# Tab: microphone recording, passed to audio_to_text as a file path.
iface_audio = gr.Interface(
    fn=audio_to_text,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Whisper Small Dutch - Use your microphone",
    description="Realtime demo for dutch speech recognition using a fine-tuned Whisper small model.",
)

# Tab: text input (a radio stream URL) -> transcript plus summary from radio_to_text.
iface_radio = gr.Interface(
    fn=radio_to_text,
    inputs="text",
    outputs="text",
    title="Whisper Small Dutch - Use a radio URL",
    description="Demo for dutch speech recognition using a fine-tuned Whisper small model. It gets information on what is playing on the given radio URL. It transcribes it and then summarises it using chatGPT.",
)

# Combine the three interfaces into one tabbed app; the labels follow the order of the list.
app = gr.TabbedInterface([iface_audio, iface_video_url, iface_radio], ["Audio to text", "Video to text", "Radio to text"])

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    app.launch()