"""Streamlit app: generate text from an uploaded image plus an optional prompt
using a Hugging Face image-text-to-text pipeline (Llama 3.2 Vision)."""

import streamlit as st
from transformers import pipeline, AutoFeatureExtractor
from PIL import Image
import torch

# Hugging Face access token, read once from Streamlit secrets.
# (The original assigned this twice from two differently-cased secret keys;
# the effective key was "hf_token", kept here.)
HF_TOKEN = st.secrets["hf_token"]

# Gated vision-language model used for the image-text-to-text task.
MODEL_ID = "meta-llama/Llama-3.2-11B-Vision"  # Replace with the correct model ID


@st.cache_resource
def initialize_pipeline():
    """Create and cache the image-text-to-text pipeline.

    Cached with ``st.cache_resource`` so the multi-gigabyte model is loaded
    once per server process instead of on every Streamlit rerun.

    Returns:
        A transformers ``Pipeline`` for the "image-text-to-text" task.
    """
    return pipeline(
        "image-text-to-text",
        model=MODEL_ID,
        # `use_auth_token` inside model_kwargs is deprecated; the supported
        # way to authenticate is the `token` argument on pipeline().
        token=HF_TOKEN,
        model_kwargs={"torch_dtype": torch.bfloat16},
    )


def preprocess_image(image, size=(224, 224)):
    """Convert *image* to RGB and resize it to *size*.

    Args:
        image: A PIL ``Image`` (any mode).
        size: Target ``(width, height)``; defaults to ``(224, 224)``.

    Returns:
        A new RGB PIL ``Image`` of the requested size.
    """
    return image.convert("RGB").resize(size)


# ---------------------------- Streamlit UI ----------------------------
st.title("Image and Text to Text Generation")
st.write(f"**Using model:** {MODEL_ID}")

uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
input_text = st.text_input("Enter your text input (optional):")

if st.button("Generate"):
    if uploaded_file:
        try:
            # Normalize the upload before handing it to the model.
            image = Image.open(uploaded_file)
            preprocessed_image = preprocess_image(image)

            # Lazy-initialize (cached) so the model only loads on first use.
            model_pipeline = initialize_pipeline()

            inputs = {"images": [preprocessed_image], "text": input_text}
            response = model_pipeline(**inputs)

            st.write("Generated Response:")
            st.write(response)
        except ValueError as ve:
            # The model raises this when the prompt's image-token count does
            # not match the number of provided images. A substring check is
            # used instead of an exact-message comparison so it still matches
            # when the counts differ from (0)/(1).
            if "number of image token" in str(ve):
                st.error("Make sure your image is correctly preprocessed and passed to the model.")
            else:
                st.error(f"Error: {ve}")
        except Exception as e:
            st.error(f"Error: {e}")
    else:
        st.error("Please upload an image to proceed.")