File size: 2,770 Bytes
e96b0df
 
3aff486
 
e96b0df
 
b932632
3aff486
 
 
 
 
 
 
 
c1e7ebb
 
 
 
 
3aff486
c1e7ebb
 
 
 
 
 
3aff486
 
c1e7ebb
 
3aff486
c1e7ebb
b932632
3aff486
 
c1e7ebb
 
 
3aff486
 
 
 
 
 
e96b0df
b932632
e96b0df
 
167c051
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3aff486
167c051
3aff486
 
932a9e1
3aff486
 
 
 
167c051
e96b0df
 
932a9e1
e96b0df
 
 
932a9e1
167c051
 
932a9e1
167c051
 
932a9e1
 
167c051
3aff486
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import shutil
import tempfile
import time
import urllib.parse
import urllib.request

import gradio as gr
from pytube import YouTube
from transformers import pipeline

# Hugging Face model id of the fine-tuned Whisper small checkpoint used by
# every handler in this file.
MODEL_ID = "dussen/whisper-small-nl-hc"

# Build the pipeline once at import time so each request reuses the same
# loaded model. No task is passed; only the model id is given.
pipe = pipeline(model=MODEL_ID)
print(pipe)
def download_audio(url, output_path='downloads'):
    """Download the audio track of a YouTube video and transcribe it.

    Each call downloads into its own scratch directory created under
    ``output_path``, so concurrent requests cannot overwrite each other's
    file, and the scratch directory is always removed afterwards, even when
    the download or the transcription fails.

    Args:
        url: URL of the YouTube video.
        output_path: Parent directory for the per-call scratch directory.
            It is created if it does not exist.

    Returns:
        The transcribed text, or ``None`` if any step failed. Failures are
        printed rather than raised (best-effort contract of this handler).
    """
    work_dir = None
    try:
        yt = YouTube(url)

        # first() yields the first mp4 audio-only stream pytube lists; it is
        # not guaranteed to be the highest-bitrate one.
        audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
        if audio_stream is None:
            print("Error: no mp4 audio-only stream found for this URL")
            return None

        os.makedirs(output_path, exist_ok=True)
        work_dir = tempfile.mkdtemp(dir=output_path)

        print("Downloading video...")
        # download() returns the path of the file it wrote. The file is handed
        # to the model as downloaded: the previous rename to ".mp3" only
        # changed the name, never the bytes.
        audio_path = audio_stream.download(work_dir)
        print(f"Downloaded audio to {audio_path}")

        print("Transcribing audio...")
        text = pipe(audio_path)["text"]
        print(f"Transcribed audio: {text}")
        return text
    except Exception as e:
        # UI boundary: report the failure and keep the handler alive.
        print(f"Error: {e}")
        return None
    finally:
        # Remove the download (complete or partial) no matter how we exit.
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)

def audio_to_text(audio):
    """Transcribe a recorded audio file.

    Args:
        audio: Path to the audio file to transcribe. May be ``None`` or an
            empty string when the form is submitted without a recording.

    Returns:
        The transcribed text, or an empty string when no audio was supplied.
    """
    # Nothing recorded: return an empty transcript instead of handing a
    # missing path to the model and failing inside the pipeline.
    if audio is None or (isinstance(audio, str) and not audio.strip()):
        return ""

    text = pipe(audio)["text"]
    print(text)
    return text

def radio_to_text(radio_url, max_seconds=30, max_bytes=5 * 1024 * 1024):
    """Record a bounded slice of a radio stream and transcribe it.

    A livestream never ends on its own, so the capture stops at whichever
    comes first: ``max_seconds`` of wall-clock time, ``max_bytes`` of data,
    or the server closing the connection. The capture is written to a
    private temporary file that is deleted before returning.

    Args:
        radio_url: http(s) URL of the audio stream.
        max_seconds: Maximum wall-clock time to spend recording.
        max_bytes: Maximum number of bytes to record.

    Returns:
        The transcribed text of the captured audio.

    Raises:
        ValueError: If ``radio_url`` is empty or is not an http(s) URL.
        OSError: If the stream cannot be opened or read (``urllib`` errors
            are subclasses of ``OSError``).
    """
    stream_url = (radio_url or "").strip()
    # The URL comes straight from a text box: accept only http(s) so that
    # schemes such as file:// can never be used to read local files.
    scheme = urllib.parse.urlsplit(stream_url).scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"radio_url must be an http(s) URL, got: {radio_url!r}")

    deadline = time.monotonic() + max_seconds
    remaining = max_bytes
    fd, capture_path = tempfile.mkstemp(suffix=".mp3")
    try:
        with os.fdopen(fd, "wb") as f:
            # The timeout bounds each blocking socket operation, so a stalled
            # server cannot hang the handler indefinitely.
            with urllib.request.urlopen(stream_url, timeout=10) as response:
                while remaining > 0 and time.monotonic() < deadline:
                    block = response.read(min(8192, remaining))
                    if not block:
                        break  # server closed the stream
                    f.write(block)
                    remaining -= len(block)

        text = pipe(capture_path)["text"]
    finally:
        # Always remove the capture, including on download or model errors.
        if os.path.exists(capture_path):
            os.remove(capture_path)

    print(text)
    return text

# Description text shared by the two URL-driven tabs.
_URL_TAB_DESCRIPTION = "Demo for dutch speech recognition using a fine-tuned Whisper small model."

# Tab that takes a YouTube URL and returns the transcript of its audio track.
iface_video_url = gr.Interface(
    title="Whisper Small Dutch - Use a YouTube URL",
    description=_URL_TAB_DESCRIPTION,
    fn=download_audio,
    inputs="text",
    outputs="text",
)

# Tab that records from the microphone; the handler receives a file path.
_microphone_input = gr.Audio(sources=["microphone"], type="filepath")
iface_audio = gr.Interface(
    title="Whisper Small Dutch - Use your microphone",
    description="Realtime demo for dutch speech recognition using a fine-tuned Whisper small model.",
    fn=audio_to_text,
    inputs=_microphone_input,
    outputs="text",
)

# Tab that takes the URL of a radio stream and returns its transcript.
iface_radio = gr.Interface(
    title="Whisper Small Dutch - Use a radio URL",
    description=_URL_TAB_DESCRIPTION,
    fn=radio_to_text,
    inputs="text",
    outputs="text",
)

# (tab label, interface) pairs in display order.
_TABS = (
    ("Audio to text", iface_audio),
    ("Video to text", iface_video_url),
    ("Radio to text", iface_radio),
)

app = gr.TabbedInterface(
    [interface for _, interface in _TABS],
    [label for label, _ in _TABS],
)

# Start the web server only when run as a script, not on import.
if __name__ == "__main__":
    app.launch()