Spaces:
Sleeping
Sleeping
File size: 2,038 Bytes
5fd1d62 bfe569d 5fd1d62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import gradio as gr
import numpy as np
import torch
import whisper
from moviepy.editor import *
from moviepy.video.VideoClip import TextClip
# Run Whisper on the GPU when one is available; inference falls back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Load the "base" Whisper checkpoint once at import time so every Gradio
# request reuses the same model instance (first run downloads the weights).
model = whisper.load_model("base", device=DEVICE)
def generate_video(audio_path, language):
    """Transcribe an audio file and render a caption-style video for it.

    Args:
        audio_path: Filesystem path to the input audio (Gradio passes a
            filepath because the Audio input uses type="filepath").
        language: Language code for transcription (e.g. "en"); ``None``
            lets Whisper auto-detect.

    Returns:
        Path to the rendered MP4 containing the captions plus the
        original audio track.
    """
    # Transcribe audio into timestamped segments.
    result = model.transcribe(audio_path, language=language)

    # One full-frame caption per segment, positioned at the segment's own
    # timestamps so captions stay aligned with the audio even when the
    # transcription contains silent gaps.
    clips = [
        TextClip(
            segment["text"],
            fontsize=24,
            font="Arial",
            color="white",
            bg_color="black",
            size=(1280, 720),
        )
        .set_start(segment["start"])
        .set_duration(segment["end"] - segment["start"])
        for segment in result["segments"]
    ]

    audio = AudioFileClip(audio_path)

    if not clips:
        # No speech detected: show a black frame for the whole audio.
        clips = [ColorClip((1280, 720), color=(0, 0, 0)).set_duration(audio.duration)]

    # BUG FIX: the original passed these pre-positioned clips to
    # concatenate_videoclips, which re-stacks them back-to-back and discards
    # the set_start timing — captions drifted whenever segments had gaps,
    # and the video ended before the audio did. Compositing honours the
    # timestamps; padding to the audio's duration keeps the full track.
    video = CompositeVideoClip(clips, size=(1280, 720)).set_duration(audio.duration)
    video = video.set_audio(audio)

    # fps=6 is plenty for static caption frames and keeps encoding fast.
    output_path = "./transcribed_video.mp4"
    video.write_videofile(output_path, fps=6, codec="libx264", audio_codec="aac")
    return output_path
if __name__ == "__main__":
    # Report model capabilities; p.numel() is the idiomatic torch parameter
    # count (the original round-tripped every shape through np.prod).
    print(
        f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
        f"and has {sum(p.numel() for p in model.parameters()):,} parameters."
    )

    # Gradio UI: audio in (upload or microphone), rendered caption video out.
    iface = gr.Interface(
        fn=generate_video,
        inputs=[
            gr.Audio(
                sources=["upload", "microphone"], type="filepath", label="Audio File"
            ),
            gr.Dropdown(
                ["en", "es", "fr", "de", "it", "nl", "ru", "zh"],
                label="Language",
            ),
        ],
        outputs=gr.Video(label="Play Video", show_download_button=True),
        title="Audio Transcription Video Generator",
        description="Upload your audio file and select the language for transcription.",
    )
    iface.launch()
|