Spaces:
Running
Running
# Copyright 2024 Ronan Le Meillat | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# Import necessary libraries | |
import gradio as gr | |
from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils | |
import torch | |
import spaces | |
# Select the compute device: CUDA when torch reports a usable GPU, otherwise CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")  # Log the device being used

# Hub identifiers: the adapter repository and the base checkpoint it is applied to
model_id = "eltorio/IDEFICS3_ROCO"
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"  # or change to local path

# The processor is loaded from the base model path
processor = AutoProcessor.from_pretrained(
    base_model_path,
    trust_remote_code=True,
)

# Load the base model with its torch dtype set to bfloat16 ...
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.bfloat16,
)
# ... then move it to the selected device
model = model.to(device)

# Attach the adapter found under model_id; device_map="auto" handles its placement
model.load_adapter(model_id, device_map="auto")
# Define a function to infer a description from an image.
# `spaces.GPU` asks ZeroGPU hardware to attach a GPU for the duration of each
# call; the Hugging Face ZeroGPU documentation describes the decorator as
# effect-free in non-ZeroGPU environments.
@spaces.GPU
def infere(image):
    """
    Generate a description of a medical image.

    Args:
    - image: The medical image to describe, in a format the processor accepts.
      The Gradio "image" input used by this app supplies a NumPy array by
      default (NOTE(review): confirm if the input component type is changed).

    Returns:
    - generated_texts (List[str]): The decoded output sequences returned by
      `model.generate`, one string per sequence (a single entry here, since
      one image/prompt pair is submitted).

    Raises:
    - gr.Error: If no image was provided.
    """
    # Gradio passes None when the form is submitted without an image; report
    # that clearly instead of letting the processor fail on a None input.
    if image is None:
        raise gr.Error("Please upload an image first.")

    # Conversation fed to the chat template: a system instruction followed by
    # a user turn containing an image placeholder and the question.
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a valuable medical doctor and you are looking at an image of your patient."},
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What do we see in this image?"},
            ]
        },
    ]
    # Apply the chat template and add a generation prompt
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Preprocess the input image and text
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    # Move the inputs to the specified device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Generate a description with the model (at most 100 new tokens)
    generated_ids = model.generate(**inputs, max_new_tokens=100)
    # Decode the generated IDs into text, dropping special tokens
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_texts
# HTML snippets displayed around the Gradio interface: title and short description
title = f"<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO</a>: Medical Image to Text <b>running on {device}</b>"
desc = "This model generates a description of a medical image.<br><b>Note: No affiliation with original author. This is a ZeroGPU-enabled duplicate of <a href='https://huggingface.co/spaces/eltorio/IDEFICS3_ROCO'>spaces/eltorio/IDEFICS3_ROCO</a> to support accelerated inference. Please direct your citations and likes to the original work.</b>"

# Device notice: a short confirmation on CUDA, a slowness warning otherwise
if device == 'cuda':
    device_desc = f"This model is running on {device} 🚀."
else:
    device_desc = f"🐢 This model is running on {device} it will be very (very) slow. If you can donate some GPU time it will be usable 🐢. <a href='https://huggingface.co/eltorio/IDEFICS3_ROCO/discussions'>Please contact us.</a>"

# Long description shown as the article; it embeds the device notice above
long_desc = f"This demo is based on the <a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO model</a>, which is a multimodal model that can generate text from images. It has been fine-tuned on <a href='https://huggingface.co/datasets/eltorio/ROCO-radiology'>eltorio/ROCO-radiology</a> a dataset of medical images and can generate descriptions of medical images. Try uploading an image of a medical image and see what the model generates!<br><b>{device_desc}</b><br> 2024 - Ronan Le Meillat"

# Wire the infere function into a Gradio interface: image in, text out
radiotest = gr.Interface(
    fn=infere,
    inputs="image",
    outputs="text",
    title=title,
    description=desc,
    article=long_desc,
)

# Start the Gradio app; launch() is called with its defaults (no share argument is passed)
radiotest.launch()