Commit db1babc by sergeymal
Parent(s): e53f952

Added requirements, handler and server.

Files changed (3):
  1. handler.py +77 -0
  2. requirements.txt +6 -0
  3. server.py +18 -0
handler.py ADDED
@@ -0,0 +1,77 @@
+ import base64
+ import os
+ from io import BytesIO
+ from typing import Any, Dict
+
+ import requests
+ import torch
+ from dotenv import load_dotenv
+ from PIL import Image
+ from transformers import AutoModelForVisualQuestionAnswering, AutoProcessor
+
+ # Load environment variables (e.g. SECRET_TOKEN) from a local .env file.
+ load_dotenv()
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ class EndpointHandler:
+     def __init__(self, path: str = ""):
+         # The model id is hardcoded; `path` is kept for the hosting runtime's handler contract.
+         self.processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
+         self.model = AutoModelForVisualQuestionAnswering.from_pretrained(
+             "Salesforce/blip-vqa-capfilt-large"
+         ).to(device)
+         self.model.eval()
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         # Reject the request if a secret token is configured and does not match.
+         if os.getenv("SECRET_TOKEN") and os.getenv("SECRET_TOKEN") != data.get("secret_token"):
+             return {"captions": [], "error": "Invalid secret token"}
+
+         input_data = data.get("inputs", {})
+         input_images = input_data.get("images")
+
+         if not input_images:
+             return {"captions": [], "error": "No images provided"}
+
+         # One list of questions per image; default to no questions.
+         texts_per_image = input_data.get("texts", [[] for _ in input_images])
+
+         try:
+             # Decode each image from either an inline base64 string or a URL.
+             raw_images = []
+             for img in input_images:
+                 if "base64" in img:
+                     raw_images.append(Image.open(BytesIO(base64.b64decode(img["base64"]))).convert("RGB"))
+                 elif "url" in img:
+                     raw_images.append(Image.open(BytesIO(requests.get(img["url"], timeout=30).content)).convert("RGB"))
+                 else:
+                     return {"captions": [], "error": f"Invalid image input: {list(img)}"}
+
+             # Final answers, one entry per image.
+             results = []
+
+             # Iterate over each image and its corresponding list of questions.
+             for image, questions in zip(raw_images, texts_per_image):
+                 image_captions = []  # Answers for the current image
+
+                 for question in questions:
+                     # Encode the image/question pair for the model.
+                     processed_input = self.processor(image, question, return_tensors="pt").to(device)
+
+                     # Generate and decode the answer.
+                     with torch.no_grad():
+                         out = self.model.generate(**processed_input)
+                     caption = self.processor.batch_decode(out, skip_special_tokens=True)[0]
+
+                     image_captions.append({"answer": caption})
+
+                 results.append({"image_results": image_captions})
+
+             return {"captions": results}
+
+         except Exception as e:
+             print(f"Error during processing: {e}")
+             return {"captions": [], "error": str(e)}
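
For reference, a quick local smoke test of the handler above might look like the sketch below. The image URL and questions are made-up placeholders, and it assumes SECRET_TOKEN is unset in the environment (otherwise a matching "secret_token" field must be added to the payload):

    from handler import EndpointHandler

    handler = EndpointHandler()
    payload = {
        "inputs": {
            "images": [{"url": "https://example.com/cat.jpg"}],  # placeholder URL
            "texts": [["What animal is this?", "What color is it?"]],
        },
    }
    print(handler(payload))
    # e.g. {"captions": [{"image_results": [{"answer": "..."}, {"answer": "..."}]}]}
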
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ requests
+ Pillow
+ torch
+ transformers
+ flask
+ python-dotenv
server.py ADDED
@@ -0,0 +1,18 @@
+ from flask import Flask, request, jsonify
+
+ from handler import EndpointHandler  # The handler class defined in handler.py
+
+ app = Flask(__name__)
+
+ # Instantiate once at startup so the model is loaded before the first request.
+ handler = EndpointHandler()
+
+
+ @app.route("/", methods=["POST"])
+ def generate_captions():
+     try:
+         data = request.json
+         output = handler(data)
+         return jsonify(output)
+     except Exception as e:
+         return jsonify({"error": str(e)}), 400
+
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=5000)
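
With the server running, any HTTP client can POST a request to it. A minimal Python sketch, assuming the server is reachable at http://localhost:5000 and using a placeholder image URL:

    import requests

    resp = requests.post(
        "http://localhost:5000/",
        json={
            "inputs": {
                "images": [{"url": "https://example.com/cat.jpg"}],  # placeholder URL
                "texts": [["How many animals are in the picture?"]],
            },
        },
        timeout=120,
    )
    print(resp.json())
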