Madhuri123 committed
Commit aadd242 · verified · 1 Parent(s): f67dd23

Update app.py

Files changed (1)
  1. app.py +38 -22
app.py CHANGED
@@ -1,36 +1,52 @@
  import streamlit as st
- import transformers
- import torch
- import requests
+ from transformers import pipeline
  from PIL import Image
- from transformers import MllamaForConditionalGeneration, AutoProcessor
- import subprocess
- subprocess.run(["pip", "install", "accelerate>=0.26.0"])
+ import torch

- HF_TOKEN=st.secrets["hf_token"]
+ # Load Hugging Face token
+ HF_TOKEN = st.secrets["hf_token"]

  # Load the model and pipeline
  model_id = "meta-llama/Llama-3.2-11B-Vision"
- # Streamlit user interface
- st.title("LLM Model Inference")
- st.write(f"**Using model:** {model_id}")

- # Set up the pipeline with the Hugging Face token
- model = MllamaForConditionalGeneration.from_pretrained(
-     model_id,
-     torch_dtype=torch.bfloat16,
-     device_map="auto",
+ # Initialize pipeline
+ pipeline = pipeline(
+     "text-to-image-and-text", # Hypothetical task name for multimodal processing
+     model=model_id,
+     model_kwargs={"torch_dtype": torch.bfloat16, "use_auth_token": HF_TOKEN}
  )
- processor = AutoProcessor.from_pretrained(model_id)

- url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
- image = Image.open(requests.get(url, stream=True).raw)
+ # Streamlit UI
+ st.title("Multimodal LLM Inference")
+ st.write(f"**Using model:** {model_id}")
+
+ # Text Input
+ input_text = st.text_input("Enter your prompt:")
+
+ # Image Input
+ uploaded_file = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])

- prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
- inputs = processor(image, prompt, return_tensors="pt").to(model.device)
+ if st.button("Generate"):
+     if input_text and uploaded_file:
+         # Process image
+         image = Image.open(uploaded_file)
+
+         # Prepare multimodal input
+         messages = [
+             {"role": "system", "content": "You are a multimodal assistant."},
+             {"role": "user", "content": input_text, "image": image}
+         ]

- output = model.generate(**inputs, max_new_tokens=30)
- st.write(processor.decode(output[0]))
+         # Generate response
+         response = pipeline(messages, max_new_tokens=30)
+
+         # Display results
+         st.write("Generated Response:")
+         st.write(response[0]['generated_text'][-1]['content']) # Assuming this structure
+     else:
+         st.error("Please enter a prompt and upload an image.")
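
Note: the pipeline call introduced here relies on a task name the code itself marks as hypothetical; `transformers.pipeline` has no "text-to-image-and-text" task, the assignment `pipeline = pipeline(...)` shadows the imported function, and `use_auth_token` has been superseded by `token`. Below is a minimal sketch of the same Streamlit flow using the `MllamaForConditionalGeneration`/`AutoProcessor` approach from the removed revision; the `load_model` caching helper and the prompt template are illustrative assumptions, not part of this commit.

import streamlit as st
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

model_id = "meta-llama/Llama-3.2-11B-Vision"
HF_TOKEN = st.secrets["hf_token"]

@st.cache_resource  # load the checkpoint once per session instead of on every rerun
def load_model():
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=HF_TOKEN,  # `token` replaces the deprecated `use_auth_token`
    )
    processor = AutoProcessor.from_pretrained(model_id, token=HF_TOKEN)
    return model, processor

model, processor = load_model()

st.title("Multimodal LLM Inference")
st.write(f"**Using model:** {model_id}")

input_text = st.text_input("Enter your prompt:")
uploaded_file = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])

if st.button("Generate"):
    if input_text and uploaded_file:
        image = Image.open(uploaded_file)
        # Base (non-Instruct) checkpoint: the image token precedes the text prompt
        prompt = f"<|image|><|begin_of_text|>{input_text}"
        inputs = processor(image, prompt, return_tensors="pt").to(model.device)
        output = model.generate(**inputs, max_new_tokens=30)
        st.write(processor.decode(output[0], skip_special_tokens=True))
    else:
        st.error("Please enter a prompt and upload an image.")

Caching the model with `st.cache_resource` matters in Streamlit: the script re-executes on every widget interaction, so the committed version would otherwise rebuild the 11B pipeline on each rerun.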