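"""Video to Subtitle: a Gradio app that extracts audio from an uploaded video,
splits it on silence (VAD), transcribes each chunk with Wav2Vec2, and returns
an SRT subtitle file."""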
import os
import re
import shutil
import subprocess

import soundfile
import torch
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

from process_audio import segment_audio
from write_srt import write_to_file
# Pretrained Wav2Vec2 CTC checkpoint for English ASR
model_name = "facebook/wav2vec2-large-960h-lv60-self"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(model_name)  # append .to('cuda') to run inference on GPU

# Running subtitle index for the SRT file
line_count = 0
def sort_alphanumeric(data):
    # Sort file names numerically where possible, so "chunk_2" comes before "chunk_10"
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
    return sorted(data, key=alphanum_key)
def transcribe_audio(tokenizer, asr_model, audio_file, file_handle):
    """Run Wav2Vec2 inference on one VAD-segmented audio chunk and append it to the SRT file."""
    global line_count
    speech, rate = soundfile.read(audio_file)
    input_values = tokenizer(speech, sampling_rate=16000, return_tensors="pt", padding="longest").input_values
    with torch.no_grad():  # inference only, no gradients needed
        logits = asr_model(input_values).logits
    prediction = torch.argmax(logits, dim=-1)
    inferred_text = tokenizer.batch_decode(prediction)[0].lower()
    # Light post-processing: collapse runs of whitespace and capitalise the pronoun "I"
    inferred_text = re.sub(r'\s+', ' ', inferred_text)
    inferred_text = re.sub(r'\bi\s', 'I ', inferred_text)
    inferred_text = re.sub(r'\si$', ' I', inferred_text)
    inferred_text = re.sub(r'i\'', 'I\'', inferred_text)
    # Segment start/end times are encoded in the file name as "..._<start>-<end>.wav"
    limits = audio_file.split(os.sep)[-1][:-4].split("_")[-1].split("-")
    if len(inferred_text) > 1:
        line_count += 1
        write_to_file(file_handle, inferred_text, line_count, limits)
def get_subs(input_file):
    """Extract audio from a video, split it on silence, transcribe each chunk, and return the SRT path."""
    # Start from a fresh working directory for the audio chunks
    base_directory = os.getcwd()
    audio_directory = os.path.join(base_directory, "audio")
    if os.path.isdir(audio_directory):
        shutil.rmtree(audio_directory)
    os.mkdir(audio_directory)

    # Extract mono 16 kHz audio from the video (the sampling rate Wav2Vec2 expects)
    video_file = input_file
    audio_file = os.path.join(audio_directory, "temp.wav")
    command = ["ffmpeg", "-i", video_file, "-ac", "1", "-ar", "16000", "-vn", "-f", "wav", audio_file]
    subprocess.run(command, check=True)  # raise if ffmpeg fails

    video_name = os.path.splitext(os.path.basename(input_file))[0]
    srt_file_name = video_name + ".srt"

    # Split the audio on VAD-detected silences, then drop the full-length file
    segment_audio(audio_file)
    os.remove(audio_file)

    # Write the SRT file from scratch on every run
    with open(srt_file_name, "w") as file_handle:
        for file in sort_alphanumeric(os.listdir(audio_directory)):
            audio_segment_path = os.path.join(audio_directory, file)
            if audio_segment_path.split(os.sep)[-1] != audio_file.split(os.sep)[-1]:
                transcribe_audio(tokenizer, asr_model, audio_segment_path, file_handle)

    shutil.rmtree(audio_directory)
    return srt_file_name
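
# Local usage sketch, without the UI (assuming a hypothetical "sample.mp4" next to this script):
#   srt_path = get_subs("sample.mp4")
#   print(open(srt_path).read())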
# Gradio 3+ component API (the older gr.inputs/gr.outputs namespaces have been removed)
gradio_ui = gr.Interface(
    fn=get_subs,
    title="Video to Subtitle",
    description=(
        "Get subtitles (an SRT file) for your videos. Inference takes roughly 10 s per "
        "minute of video, but upload time depends on your internet connection."
    ),
    inputs=gr.Video(label="Upload Video File"),
    outputs=gr.File(label="Auto-Transcript"),
)

gradio_ui.launch(inline=False)
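# Tip: pass share=True to launch() for a temporary public link when running locally.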