Spaces:
Paused
Paused
add snapshot download
Browse files
app.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
|
2 |
import torch
|
3 |
import torch.nn as nn
|
4 |
import torch.nn.functional as F
|
@@ -29,7 +28,6 @@ class GELU(nn.Module):
|
|
29 |
self.linear = nn.Linear(dim_in, dim_out, bias=bias)
|
30 |
self.approximate = approximate
|
31 |
|
32 |
-
@spaces.GPU
|
33 |
def forward(self, x):
|
34 |
if self.approximate == 'tanh':
|
35 |
return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
@@ -48,7 +46,6 @@ class Rope2D(nn.Module):
|
|
48 |
self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
|
49 |
self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
|
50 |
|
51 |
-
@spaces.GPU
|
52 |
def forward(self, x, seq_len=None):
|
53 |
if seq_len > self.max_seq_len_cached:
|
54 |
self.max_seq_len_cached = seq_len
|
@@ -72,7 +69,6 @@ class VisionEncoder(nn.Module):
|
|
72 |
self.norm = nn.LayerNorm(config['hidden_size'])
|
73 |
self.gelu = GELU(config['hidden_size'], config['hidden_size'])
|
74 |
|
75 |
-
@spaces.GPU
|
76 |
def forward(self, pixel_values):
|
77 |
x = self.embed(pixel_values)
|
78 |
b, c, h, w = x.shape
|
@@ -90,25 +86,26 @@ class PixtralModel(nn.Module):
|
|
90 |
self.vision_encoder = VisionEncoder(params['vision_encoder'])
|
91 |
# Add text generation components here
|
92 |
|
93 |
-
@spaces.GPU
|
94 |
def forward(self, image):
|
95 |
vision_output = self.vision_encoder(image)
|
96 |
# Add text generation logic here
|
97 |
return vision_output
|
98 |
|
99 |
-
@spaces.GPU
|
100 |
def load_model(params, model_path):
|
101 |
model = PixtralModel(params)
|
102 |
|
103 |
-
with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="
|
104 |
for name, param in model.named_parameters():
|
105 |
if name in f.keys():
|
106 |
param.data = f.get_tensor(name)
|
107 |
|
108 |
model.eval()
|
109 |
-
return model
|
110 |
|
|
|
111 |
model = load_model(params, model_path)
|
|
|
|
|
112 |
tokenizer = MistralTokenizer.from_model("pixtral")
|
113 |
|
114 |
@spaces.GPU
|
@@ -137,7 +134,9 @@ def process_image_and_text(image, prompt):
|
|
137 |
|
138 |
# Process the image and generate text
|
139 |
with torch.no_grad():
|
|
|
140 |
vision_output = model(image_tensor)
|
|
|
141 |
# Add text generation logic here
|
142 |
generated_text = f"Generated text based on the image and prompt: {prompt}"
|
143 |
|
|
|
|
|
1 |
import torch
|
2 |
import torch.nn as nn
|
3 |
import torch.nn.functional as F
|
|
|
28 |
self.linear = nn.Linear(dim_in, dim_out, bias=bias)
|
29 |
self.approximate = approximate
|
30 |
|
|
|
31 |
def forward(self, x):
|
32 |
if self.approximate == 'tanh':
|
33 |
return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
|
|
46 |
self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
|
47 |
self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
|
48 |
|
|
|
49 |
def forward(self, x, seq_len=None):
|
50 |
if seq_len > self.max_seq_len_cached:
|
51 |
self.max_seq_len_cached = seq_len
|
|
|
69 |
self.norm = nn.LayerNorm(config['hidden_size'])
|
70 |
self.gelu = GELU(config['hidden_size'], config['hidden_size'])
|
71 |
|
|
|
72 |
def forward(self, pixel_values):
|
73 |
x = self.embed(pixel_values)
|
74 |
b, c, h, w = x.shape
|
|
|
86 |
self.vision_encoder = VisionEncoder(params['vision_encoder'])
|
87 |
# Add text generation components here
|
88 |
|
|
|
89 |
def forward(self, image):
|
90 |
vision_output = self.vision_encoder(image)
|
91 |
# Add text generation logic here
|
92 |
return vision_output
|
93 |
|
|
|
94 |
def load_model(params, model_path):
    """Build a PixtralModel and load matching weights from a safetensors file.

    Args:
        params: Configuration passed unchanged to ``PixtralModel``.
        model_path: Directory that contains ``consolidated.safetensors``.

    Returns:
        The model, switched to evaluation mode. Tensors are read with
        ``device="cpu"``, so loaded weights live on the CPU.
    """
    import logging

    model = PixtralModel(params)

    with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
        # Ask the checkpoint for its tensor names once, not once per parameter.
        available = set(f.keys())
        missing = []
        for name, param in model.named_parameters():
            if name in available:
                param.data = f.get_tensor(name)
            else:
                missing.append(name)

    if missing:
        # Parameters absent from the checkpoint keep their initial values;
        # surface that instead of skipping them silently.
        logging.getLogger(__name__).warning(
            "%d model parameter(s) not found in checkpoint; keeping initial values: %s",
            len(missing),
            ", ".join(missing),
        )

    model.eval()
    return model
|
104 |
|
105 |
+
# Initialize the model
|
106 |
model = load_model(params, model_path)
|
107 |
+
|
108 |
+
# Initialize the tokenizer
|
109 |
tokenizer = MistralTokenizer.from_model("pixtral")
|
110 |
|
111 |
@spaces.GPU
|
|
|
134 |
|
135 |
# Process the image and generate text
|
136 |
with torch.no_grad():
|
137 |
+
model.cuda() # Move model to GPU only when processing
|
138 |
vision_output = model(image_tensor)
|
139 |
+
model.cpu() # Move model back to CPU after processing
|
140 |
# Add text generation logic here
|
141 |
generated_text = f"Generated text based on the image and prompt: {prompt}"
|
142 |
|