import gradio as gr from gradio_client import Client import cv2 from moviepy.editor import * # 1. extract and store 1 image every 5 images from video input # 2. extract audio # 3. for each image from extracted_images, get caption from caption model and concatenate into list # 4. for audio, ask audio questioning model to describe sound/scene # 5. give all to LLM, and ask it to resume, according to image caption list combined to audio caption import re import torch from transformers import pipeline zephyr_model = "HuggingFaceH4/zephyr-7b-beta" pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto") standard_sys = f""" You will be provided a list of visual events, and an audio description. All these informations come from a single video. List of visual events are actually extracted from this video every 12 frames. These visual infos are extracted from a video that is usually a short sequenc. As a smart assistant, you must understand that Repetitive visual element of the same person or group of subject means that it is the same person/subject, filmed without cut. For example, if visual elements is like this: "An older man wearing a brown hat and glasses, looking off into the distance. An older man wearing a brown hat and glasses, with a beard and a mustache, is looking directly at the camera. An older man wearing a brown hat and glasses, with a beard and a beard on his chin, is looking at the camera." It does not mean there are 3 older men, but this is the same man. Because we have extracted vere close frame from the video sequence. In the meantme, Audio events are actually the description from the audio of the video. Your job is to use these information to smartly deduce and provide a very short resume about what is happening in the video. """ def extract_frames(video_in, interval=24, output_format='.jpg'): """Extract frames from a video at a specified interval and store them in a list. Args: - video_in: string or path-like object pointing to the video file - interval: integer specifying how many frames apart to extract images (default: 5) - output_format: string indicating desired format for saved images (default: '.jpg') Returns: A list of strings containing paths to saved images. """ # Initialize variables vidcap = cv2.VideoCapture(video_in) frames = [] count = 0 # Loop through frames until there are no more while True: success, image = vidcap.read() # Check if successful read and not past end of video if success: print('Read a new frame:', success) # Save current frame if it meets criteria if count % interval == 0: filename = f'frame_{count // interval}{output_format}' frames.append(filename) cv2.imwrite(filename, image) print(f'Saved {filename}') # Increment counter count += 1 # Break out of loop when done reading frames else: break # Close video capture vidcap.release() print('Done extracting frames!') return frames def process_image(image_in): client = Client("https://vikhyatk-moondream1.hf.space/") result = client.predict( image_in, # filepath in 'image' Image component "Describe precisely the image in one sentence.", # str in 'Question' Textbox component api_name="/answer_question" #api_name="/predict" ) print(result) return result def extract_audio(video_path): video_clip = VideoFileClip(video_path) audio_clip = video_clip.audio audio_clip.write_audiofile("output_audio.mp3") return "output_audio.mp3" def get_salmonn(audio_in): salmonn_prompt = "Please describe the audio" client = Client("fffiloni/SALMONN-7B-gradio") result = client.predict( audio_in, # filepath in 'Audio' Audio component salmonn_prompt, # str in 'User question' Textbox component 4, # float (numeric value between 1 and 10) in 'beam search numbers' Slider component 1, # float (numeric value between 0.8 and 2.0) in 'temperature' Slider component 0.9, # float (numeric value between 0.1 and 1.0) in 'top p' Slider component api_name="/gradio_answer" ) print(result) return result def llm_process(user_prompt): agent_maker_sys = standard_sys instruction = f""" <|system|> {agent_maker_sys} <|user|> """ prompt = f"{instruction.strip()}\n{user_prompt}" outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95) pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>' cleaned_text = re.sub(pattern, '', outputs[0]["generated_text"], flags=re.DOTALL) print(f"SUGGESTED video description: {cleaned_text}") return cleaned_text.lstrip("\n") def infer(video_in): # Extract frames from a video frame_files = extract_frames(video_in) # Process each extracted frame and collect results in a list processed_texts = [] for frame_file in frame_files: text = process_image(frame_file) processed_texts.append(text) print(processed_texts) # Convert processed_texts list to a string list with line breaks string_list = '\n'.join(processed_texts) # Extract audio from video extracted_audio = extract_audio(video_in) print(extracted_audio) # Get description of audio content audio_content_described = get_salmonn(extracted_audio) # Assemble captions formatted_captions = f""" ### Visual events:\n{string_list}\n ### Audio events:\n{audio_content_described} """ print(formatted_captions) # Send formatted captions to LLM video_description_from_llm = llm_process(formatted_captions) return video_description_from_llm with gr.Blocks() as demo : with gr.Column(elem_id="col-container"): gr.HTML("""