from decord import VideoReader
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
import gradio as gr

device = "cuda" if torch.cuda.is_available() else "cpu"

# load pretrained processor, tokenizer, and model
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained(
    "Neleac/timesformer-gpt2-video-captioning"
).to(device)

with gr.Blocks() as demo:
    demo.title = "Semantic Summarization of Videos using DLSG"
    gr.Markdown('# Semantic Summarization of Videos using DLSG, Demo by Batch_B29')
    with gr.Row():
        with gr.Column(scale=2):
            video = gr.Video(label="Upload Video", format="mp4")
            generate = gr.Button(value="Generate Caption")
        with gr.Column(scale=1):
            text = gr.Textbox(label="Caption", placeholder="Caption will appear here")
            with gr.Accordion("Settings", open=True):
                with gr.Row():
                    max_length = gr.Slider(
                        label="Max Length", minimum=10, maximum=100, value=20, step=1
                    )
                    min_length = gr.Slider(
                        label="Min Length", minimum=1, maximum=10, value=10, step=1
                    )

    def generate_caption(video, max_length, min_length):
        # read the uploaded video and sample evenly spaced frames
        throughputs = 1
        container = VideoReader(video)
        clip_len = model.config.encoder.num_frames
        frames = container.get_batch(
            range(0, len(container), len(container) // (throughputs * clip_len))
        ).asnumpy()
        # drop the trailing frame so the clip matches the encoder's expected length
        frames = [frame for frame in frames[:-1]]

        # preprocess frames and generate the caption
        gen_kwargs = {
            "min_length": min_length,
            "max_length": max_length,
        }
        pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(
            device
        )
        tokens = model.generate(pixel_values, **gen_kwargs)
        caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
        return caption

    generate.click(
        generate_caption,
        inputs=[video, max_length, min_length],
        outputs=text,
    )

if __name__ == "__main__":
    demo.launch()