kiddobellamy
/

Llama_Vision

Video-Text-to-Text

text-generation

Inference Endpoints

Model card | Files and versions | Community

kiddobellamy committed on Sep 28

Commit

6cf0252

•

1 Parent(s): af217f3

Update handler.py

Files changed (1) hide show

handler.py +24 -13

handler.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import torch
-from transformers import LlamaForCausalLM, AutoProcessor
 from PIL import Image
 import base64
 import io
@@ -7,11 +7,17 @@ import io
 # Load model and processor globally
 model_id = "kiddobellamy/Llama_Vision"
 model = LlamaForCausalLM.from_pretrained(
     model_id,
-    torch_dtype=torch.bfloat16,
     device_map="auto",
 )
 processor = AutoProcessor.from_pretrained(model_id)
 def handler(event, context):
@@ -28,24 +34,29 @@ def handler(event, context):
         image_bytes = base64.b64decode(image_base64)
         image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
-        # Prepare the message
-        messages = [
-            {"role": "user", "content": [
-                {"type": "image"},
-                {"type": "text", "text": prompt}
-            ]}
-        ]
-        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
-        # Process inputs
-        inputs = processor(image, input_text, return_tensors="pt").to(model.device)
         # Generate output
         output_ids = model.generate(**inputs, max_new_tokens=50)
-        generated_text = processor.decode(output_ids[0], skip_special_tokens=True)
         # Return the result
         return {'generated_text': generated_text}
     except Exception as e:
         return {'error': str(e)}

 import torch
+from transformers import LlamaForCausalLM, AutoTokenizer, AutoProcessor
 from PIL import Image
 import base64
 import io
 # Load model and processor globally
 model_id = "kiddobellamy/Llama_Vision"
+# Load the model
 model = LlamaForCausalLM.from_pretrained(
     model_id,
+    torch_dtype=torch.float16,  # Use torch.float16 if bfloat16 is not supported
     device_map="auto",
 )
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Load the processor if needed (for image processing)
 processor = AutoProcessor.from_pretrained(model_id)
 def handler(event, context):
         image_bytes = base64.b64decode(image_base64)
         image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
+        # Process image if necessary (depends on your model)
+        # Assuming your processor handles image preprocessing
+        image_inputs = processor(images=image, return_tensors="pt").to(model.device)
+        # Tokenize the prompt
+        text_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        # Combine image and text inputs if required by your model
+        # This step depends on how your model processes images and text together
+        inputs = {
+            'input_ids': text_inputs['input_ids'],
+            'attention_mask': text_inputs['attention_mask'],
+            # Include image inputs as required
+            # 'pixel_values': image_inputs['pixel_values'],
+        }
         # Generate output
         output_ids = model.generate(**inputs, max_new_tokens=50)
+        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
         # Return the result
         return {'generated_text': generated_text}
     except Exception as e:
         return {'error': str(e)}
+#111