ruslanmv committed on
Commit
6d16e49
1 Parent(s): 55d7c93

First commit

Browse files
Files changed (2) hide show
  1. app.py +56 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import torch
4
+ from transformers import AutoProcessor, MllamaForConditionalGeneration
5
+ from PIL import Image
6
+
7
# Read the Hugging Face access token from the environment.
# `os.environ.get` yields None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get('HF_TOKEN')

# Load the model and its processor once, at import time, so every request
# served by `predict` reuses the same instances.
model_name = "ruslanmv/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    # `token` replaces the deprecated `use_auth_token` keyword.
    token=HF_TOKEN,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)
19
+
20
def predict(image, text):
    """Generate the model's reply to a text prompt about an image.

    Args:
        image: The image to show the model (the Gradio interface in this
            file is configured to pass a PIL image). May be None if the
            caller supplied no image.
        text: The user's prompt.

    Returns:
        The decoded text of the newly generated tokens only; the prompt is
        not echoed back.

    Raises:
        gr.Error: If ``image`` is None.
    """
    # Fail with a readable message instead of an opaque processor error.
    if image is None:
        raise gr.Error("Please upload an image.")

    # Single user turn: one image placeholder followed by the prompt text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": text},
            ],
        }
    ]

    # Render the conversation with the processor's chat template and append
    # the assistant header so the model starts its answer.
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    # The templated prompt already carries the begin-of-text token (see the
    # model card usage example), so do not let the tokenizer add it again.
    inputs = processor(
        images=image,
        text=input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=100)

    # `generate` returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    return processor.decode(generated_tokens, skip_special_tokens=True)
41
+
42
# Define the Gradio interface: an image and a text prompt in, generated text out.
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Image Input"),  # handed to predict as a PIL image
        gr.Textbox(label="Text Input"),
    ],
    outputs=gr.Textbox(label="Generated Response"),
    title="Llama 3.2 11B Vision Instruct Demo",
    description="This demo uses Meta's Llama 3.2 11B Vision model to generate responses based on an image and text input.",
    # No `theme` argument: "compact" is not one of Gradio's built-in theme
    # names, so the default theme is used instead.
)

# Launch the interface
interface.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ git+https://github.com/huggingface/transformers.git
3
+ torch
4
+ accelerate
5
+ bitsandbytes
6
+ sentencepiece
7
+ Pillow