Tonic committed on
Commit
e562e7a
·
unverified ·
1 Parent(s): f570b2f

add snapshot download

Browse files
Files changed (1) hide show
  1. app.py +7 -8
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import torch
3
  import torch.nn as nn
4
  import torch.nn.functional as F
@@ -29,7 +28,6 @@ class GELU(nn.Module):
29
  self.linear = nn.Linear(dim_in, dim_out, bias=bias)
30
  self.approximate = approximate
31
 
32
- @spaces.GPU
33
  def forward(self, x):
34
  if self.approximate == 'tanh':
35
  return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
@@ -48,7 +46,6 @@ class Rope2D(nn.Module):
48
  self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
49
  self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
50
 
51
- @spaces.GPU
52
  def forward(self, x, seq_len=None):
53
  if seq_len > self.max_seq_len_cached:
54
  self.max_seq_len_cached = seq_len
@@ -72,7 +69,6 @@ class VisionEncoder(nn.Module):
72
  self.norm = nn.LayerNorm(config['hidden_size'])
73
  self.gelu = GELU(config['hidden_size'], config['hidden_size'])
74
 
75
- @spaces.GPU
76
  def forward(self, pixel_values):
77
  x = self.embed(pixel_values)
78
  b, c, h, w = x.shape
@@ -90,25 +86,26 @@ class PixtralModel(nn.Module):
90
  self.vision_encoder = VisionEncoder(params['vision_encoder'])
91
  # Add text generation components here
92
 
93
- @spaces.GPU
94
  def forward(self, image):
95
  vision_output = self.vision_encoder(image)
96
  # Add text generation logic here
97
  return vision_output
98
 
99
- @spaces.GPU
100
def load_model(params, model_path):
    """Instantiate PixtralModel and load its weights from a safetensors checkpoint.

    Bug fix: the original opened the checkpoint with device="cuda" and returned
    model.cuda(), which runs at module import time and crashes on hosts without
    a visible GPU (e.g. ZeroGPU Spaces, where CUDA is only usable inside
    @spaces.GPU-decorated calls). Load on CPU instead and let the caller move
    the model to an accelerator when it is actually needed.

    Args:
        params: hyperparameter dict forwarded to PixtralModel.
        model_path: directory containing 'consolidated.safetensors'.

    Returns:
        The model on CPU, switched to eval mode.
    """
    model = PixtralModel(params)

    with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
        # Hoist keys into a set: O(1) membership instead of an O(n) list scan
        # per parameter (f.keys() returns a list).
        checkpoint_keys = set(f.keys())
        for name, param in model.named_parameters():
            if name in checkpoint_keys:
                param.data = f.get_tensor(name)
            # NOTE(review): parameters absent from the checkpoint are silently
            # left at their random init — confirm this is intended.

    model.eval()
    return model
110
 
 
111
  model = load_model(params, model_path)
 
 
112
  tokenizer = MistralTokenizer.from_model("pixtral")
113
 
114
  @spaces.GPU
@@ -137,7 +134,9 @@ def process_image_and_text(image, prompt):
137
 
138
  # Process the image and generate text
139
  with torch.no_grad():
 
140
  vision_output = model(image_tensor)
 
141
  # Add text generation logic here
142
  generated_text = f"Generated text based on the image and prompt: {prompt}"
143
 
 
 
1
  import torch
2
  import torch.nn as nn
3
  import torch.nn.functional as F
 
28
  self.linear = nn.Linear(dim_in, dim_out, bias=bias)
29
  self.approximate = approximate
30
 
 
31
  def forward(self, x):
32
  if self.approximate == 'tanh':
33
  return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
 
46
  self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
47
  self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
48
 
 
49
  def forward(self, x, seq_len=None):
50
  if seq_len > self.max_seq_len_cached:
51
  self.max_seq_len_cached = seq_len
 
69
  self.norm = nn.LayerNorm(config['hidden_size'])
70
  self.gelu = GELU(config['hidden_size'], config['hidden_size'])
71
 
 
72
  def forward(self, pixel_values):
73
  x = self.embed(pixel_values)
74
  b, c, h, w = x.shape
 
86
  self.vision_encoder = VisionEncoder(params['vision_encoder'])
87
  # Add text generation components here
88
 
 
89
  def forward(self, image):
90
  vision_output = self.vision_encoder(image)
91
  # Add text generation logic here
92
  return vision_output
93
 
 
94
def load_model(params, model_path):
    """Build a PixtralModel and populate it from a safetensors checkpoint.

    Weights are loaded on CPU so this is safe to call at module import time on
    GPU-less hosts; move the model to an accelerator at inference time.

    Args:
        params: hyperparameter dict forwarded to PixtralModel.
        model_path: directory containing 'consolidated.safetensors'.

    Returns:
        The model on CPU, in eval mode.
    """
    model = PixtralModel(params)

    with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
        # Build the key set once: `name in f.keys()` re-materialises and scans
        # the full key list for every parameter (O(n^2) overall).
        checkpoint_keys = set(f.keys())
        for name, param in model.named_parameters():
            if name in checkpoint_keys:
                param.data = f.get_tensor(name)
            # NOTE(review): parameters missing from the checkpoint keep their
            # random init — confirm that is the intended behaviour.

    model.eval()
    return model
104
 
105
+ # Initialize the model
106
  model = load_model(params, model_path)
107
+
108
+ # Initialize the tokenizer
109
  tokenizer = MistralTokenizer.from_model("pixtral")
110
 
111
  @spaces.GPU
 
134
 
135
  # Process the image and generate text
136
  with torch.no_grad():
137
+ model.cuda() # Move model to GPU only when processing
138
  vision_output = model(image_tensor)
139
+ model.cpu() # Move model back to CPU after processing
140
  # Add text generation logic here
141
  generated_text = f"Generated text based on the image and prompt: {prompt}"
142