Bagi4 committed
Commit f5b860e · 1 Parent(s): 821eb07
Files changed (3)
  1. Dockerfile +1 -1
  2. main.py +81 -1
  3. requirements.txt +4 -0
Dockerfile CHANGED
@@ -2,4 +2,4 @@ FROM python:3.9
 
 COPY . .
 
-CMD ["python", "main.py"]
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py CHANGED
@@ -1 +1,81 @@
-print("hello World")
+import base64
+import logging
+from io import BytesIO
+from typing import List
+import clip
+import numpy as np
+import torch
+from PIL import Image
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+logging.basicConfig(
+    format='%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s',
+    level=logging.DEBUG,
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+model, preprocess = clip.load("models/ViT-B-32.pt")
+model.cpu().eval()
+
+app = FastAPI()
+
+
+class Texts(BaseModel):
+    texts_source: List[str]
+    texts_target: List[str]
+
+
+class Images(BaseModel):
+    images: List[str]
+    texts: List[str]
+
+
+class CLIPResponse(BaseModel):
+    similarity: List[List[float]]
+
+
+@app.post("/clip_image_to_text", response_model=CLIPResponse, tags=["CLIP"])
+def clip_image_to_text(data: Images) -> CLIPResponse:
+    preprocessed_images = []
+    for image in data.images:
+        if 'base64,' not in image:
+            raise HTTPException(422, "Image must be in base64")
+        image = BytesIO(base64.b64decode(image.split('base64,')[-1]))
+        image = Image.open(image)
+        preprocessed_images.append(preprocess(image))
+
+    image_input = torch.tensor(np.stack(preprocessed_images)).cpu()
+    text_tokens = clip.tokenize(["This is the " + desc for desc in data.texts]).cpu()
+
+    with torch.no_grad():
+        image_features = model.encode_image(image_input).float()
+        text_features = model.encode_text(text_tokens).float()
+
+    image_features /= image_features.norm(dim=-1, keepdim=True)
+    text_features /= text_features.norm(dim=-1, keepdim=True)
+    similarity = text_features.cpu().numpy() @ image_features.cpu().numpy().T
+    logging.debug(f"Similarity: {similarity}")
+    return CLIPResponse(similarity=similarity.tolist())
+
+
+@app.post("/clip_text_to_text", response_model=CLIPResponse, tags=["CLIP"])
+def clip_text_to_text(data: Texts) -> CLIPResponse:
+    text_input = clip.tokenize([f"This is {text}" for text in data.texts_source]).cpu()
+    text_output = clip.tokenize(data.texts_target).cpu()
+
+    with torch.no_grad():
+        input_features = model.encode_text(text_input).float()
+        output_features = model.encode_text(text_output).float()
+
+    input_features /= input_features.norm(dim=-1, keepdim=True)
+    output_features /= output_features.norm(dim=-1, keepdim=True)
+
+    similarity = output_features.cpu().numpy() @ input_features.cpu().numpy().T
+    logging.debug(f"Similarity: {similarity}")
+    return CLIPResponse(similarity=similarity.tolist())
+
+
+@app.get("/ping", tags=["TEST"])
+def ping():
+    return "pong"
+
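
A minimal client sketch for the new endpoints (assumptions: the service is reachable at http://localhost:7860 as set by the Dockerfile CMD, requests is installed on the caller's side, and cat.jpg is a hypothetical local file — none of this is part of the commit):

# client_sketch.py — hypothetical caller for the endpoints above
import base64
import requests

with open("cat.jpg", "rb") as f:  # hypothetical image file
    data_uri = "data:image/jpeg;base64," + base64.b64encode(f.read()).decode()

# /clip_image_to_text expects base64 data URIs plus candidate texts and
# returns a texts-by-images matrix of cosine similarities.
resp = requests.post(
    "http://localhost:7860/clip_image_to_text",
    json={"images": [data_uri], "texts": ["cat", "dog"]},
)
print(resp.json()["similarity"])

# Liveness check added in this commit.
print(requests.get("http://localhost:7860/ping").json())  # "pong"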
requirements.txt ADDED
@@ -0,0 +1,4 @@
+fastapi==0.103.0
+uvicorn==0.23.2
+pydantic==2.3.0
+clip @ git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33
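
Both endpoints return raw cosine similarities of L2-normalized CLIP embeddings (rows index the prompt/target texts, columns the images or source texts). If a caller wants per-image label probabilities in the style of CLIP's zero-shot classification, one option — client-side only, not part of this commit — is to scale and softmax the matrix, e.g. with a scale of 100 roughly matching CLIP's learned logit scale:

# postprocess_sketch.py — hypothetical client-side interpretation of the response
import numpy as np

def label_probabilities(similarity, scale=100.0):
    """Convert a texts-by-images cosine-similarity matrix into per-image
    probabilities over the candidate texts (softmax over the text axis)."""
    logits = scale * np.asarray(similarity)          # scale ~ CLIP's logit scale
    logits -= logits.max(axis=0, keepdims=True)      # numerical stability
    probs = np.exp(logits)
    return probs / probs.sum(axis=0, keepdims=True)  # each column sums to 1

# Example: two candidate texts, one image
print(label_probabilities([[0.31], [0.24]]))  # first text wins by a wide margin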