from decord import VideoReader
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
import gradio as gr

device = "cuda" if torch.cuda.is_available() else "cpu"

# load pretrained processor, tokenizer, and model
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained(
    "Neleac/timesformer-gpt2-video-captioning"
).to(device)

with gr.Blocks() as demo:
    demo.title = "Semantic Summarization of Videos using DLSG"
    gr.Markdown('# Semantic Summarization of Videos using DLSG, Demo by Batch_B29')
    with gr.Row():
        with gr.Column(scale=2):
            video = gr.Video(label="Upload Video", format="mp4")
            generate = gr.Button(value="Generate Caption")
        with gr.Column(scale=1):
            text = gr.Textbox(label="Caption", placeholder="Caption will appear here")
            with gr.Accordion("Settings", open=True):
                with gr.Row():
                    max_length = gr.Slider(
                        label="Max Length", minimum=10, maximum=100, value=20, step=1
                    )
                    min_length = gr.Slider(
                        label="Min Length", minimum=1, maximum=10, value=10, step=1
                    )

    def generate_caption(video, max_length, min_length):
        # read the uploaded video and sample evenly spaced frames
        throughputs = 1
        container = VideoReader(video)
        clip_len = model.config.encoder.num_frames
        frames = container.get_batch(
            range(0, len(container), len(container) // (throughputs * clip_len))
        ).asnumpy()
        # drop the trailing frame so the clip matches the encoder's expected length
        frames = [frame for frame in frames[:-1]]

        # preprocess frames and generate the caption
        gen_kwargs = {
            "min_length": min_length,
            "max_length": max_length,
        }
        pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(
            device
        )
        tokens = model.generate(pixel_values, **gen_kwargs)
        caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
        return caption

    generate.click(
        generate_caption,
        inputs=[video, max_length, min_length],
        outputs=text,
    )

if __name__ == "__main__":
    demo.launch()