"""Password-protected Gradio demo that captions an uploaded image."""

import json
import os

import gradio as gr
import numpy as np
import openai
import torch
from transformers import AutoProcessor, AutoModelForCausalLM

# OpenAI credentials and the demo login are read from the environment.
openai.organization = os.getenv("API_ORG")
openai.api_key = os.getenv("API_KEY")
app_password = os.getenv("APP_PASSWORD")
app_username = os.getenv("APP_USERNAME")

# The checkpoint must be a generative image-to-text model: it is loaded with
# AutoModelForCausalLM and driven through ``model.generate`` below. A CLIP
# checkpoint is a contrastive model and cannot be loaded by that auto class.
checkpoint = "microsoft/git-base"

# Resolve the device once so the model and its inputs always live on the
# same device.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)


def generate(input_image) -> str:
    """Return a generated caption for ``input_image``.

    Args:
        input_image: The image supplied by the Gradio ``Image`` input, which
            is configured with ``type="pil"``. ``None`` is passed when the
            form is submitted without an image.

    Returns:
        The first decoded caption (special tokens stripped), or an empty
        string when no image was provided.
    """
    if input_image is None:
        # Nothing to caption; avoid handing ``None`` to the processor.
        return ""

    inputs = processor(images=input_image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        max_length=50,
    )
    # One image in, one sequence out: decode the batch and take its only entry.
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


demo = gr.Interface(
    fn=generate,
    inputs=gr.Image(label="Input", elem_id="input_image", type="pil"),
    outputs=gr.Text(label="Generated Caption"),
    flagging_options=[],
)

# NOTE(review): if APP_USERNAME or APP_PASSWORD is unset, the auth tuple
# contains None -- confirm the deployment always sets both.
demo.launch(share=False, auth=(app_username, app_password))