Tonic committed
Commit f570b2f · unverified · 1 Parent(s): 9a64677

add snapshot download

Files changed (1): app.py +26 -17
app.py CHANGED
@@ -1,3 +1,4 @@
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -6,20 +7,20 @@ import json
 import gradio as gr
 from PIL import Image
 import numpy as np
+from huggingface_hub import snapshot_download
 from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageChunk
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-from huggingface_hub import snapshot_download
-import Spaces
-
+import spaces
 
 # Download model files
 model_path = snapshot_download(repo_id="mistral-community/pixtral-12b-240910")
 
-with open('params.json', 'r') as f:
+# Load model parameters and tokenizer configuration
+with open(f'{model_path}/params.json', 'r') as f:
     params = json.load(f)
 
-with open('tekken.json', 'r') as f:
+with open(f'{model_path}/tekken.json', 'r') as f:
     tokenizer_config = json.load(f)
 
 class GELU(nn.Module):
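
Note: snapshot_download caches the whole repo locally and returns the cache path, which is why the params.json / tekken.json reads can now be anchored to model_path. A minimal sketch of scoping the download (allow_patterns is a real huggingface_hub parameter; limiting it this way is an illustration, not what the commit does):

    from huggingface_hub import snapshot_download

    # Fetch only the small config files first; the multi-GB
    # consolidated.safetensors can be pulled later when the model is built.
    config_dir = snapshot_download(
        repo_id="mistral-community/pixtral-12b-240910",
        allow_patterns=["params.json", "tekken.json"],
    )
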
@@ -28,6 +29,7 @@ class GELU(nn.Module):
         self.linear = nn.Linear(dim_in, dim_out, bias=bias)
         self.approximate = approximate
 
+    @spaces.GPU
     def forward(self, x):
         if self.approximate == 'tanh':
             return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
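
Note: the expression in GELU.forward is the standard tanh approximation of GELU. A quick self-check against PyTorch's built-in (F.gelu has accepted approximate='tanh' since torch 1.12):

    import numpy as np
    import torch
    import torch.nn.functional as F

    x = torch.randn(8)
    approx = 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
    # The hand-rolled formula should match the built-in tanh variant closely.
    assert torch.allclose(approx, F.gelu(x, approximate='tanh'), atol=1e-6)
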
@@ -46,6 +48,7 @@ class Rope2D(nn.Module):
         self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
         self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
 
+    @spaces.GPU
     def forward(self, x, seq_len=None):
         if seq_len > self.max_seq_len_cached:
             self.max_seq_len_cached = seq_len
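
Note: the seq_len > self.max_seq_len_cached branch lazily regrows the cos/sin caches whenever a longer sequence arrives. The pattern in isolation, matching the buffer shapes cached above (names are illustrative; the Space's Rope2D presumably derives inv_freq from its rope config):

    import torch

    def build_rope_cache(inv_freq, seq_len):
        # Outer product of positions and inverse frequencies, duplicated
        # so cos/sin cover the full head dimension.
        t = torch.arange(seq_len, dtype=inv_freq.dtype)
        freqs = torch.outer(t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        return emb.cos()[None, None, :, :], emb.sin()[None, None, :, :]
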
@@ -69,6 +72,7 @@ class VisionEncoder(nn.Module):
         self.norm = nn.LayerNorm(config['hidden_size'])
         self.gelu = GELU(config['hidden_size'], config['hidden_size'])
 
+    @spaces.GPU
     def forward(self, pixel_values):
         x = self.embed(pixel_values)
         b, c, h, w = x.shape
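
Note: the b, c, h, w unpack after self.embed suggests a conv patch embedding whose spatial grid is then flattened into a token sequence. The usual shape bookkeeping, with hypothetical sizes (the hunk shows only the first lines of forward, so the rest is an assumption):

    import torch
    import torch.nn as nn

    embed = nn.Conv2d(3, 1024, kernel_size=16, stride=16)  # illustrative dims
    x = embed(torch.randn(1, 3, 224, 224))   # (b, c, h, w) = (1, 1024, 14, 14)
    b, c, h, w = x.shape
    tokens = x.flatten(2).transpose(1, 2)    # (b, h*w, c): one token per patch
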
@@ -86,30 +90,34 @@ class PixtralModel(nn.Module):
         self.vision_encoder = VisionEncoder(params['vision_encoder'])
         # Add text generation components here
 
+    @spaces.GPU
     def forward(self, image):
         vision_output = self.vision_encoder(image)
         # Add text generation logic here
         return vision_output
 
-# Initialize the model
-model = PixtralModel(params)
-
-# Load the model weights
-with safe_open('consolidated.safetensors', framework="pt", device="cpu") as f:
-    for name, param in model.named_parameters():
-        if name in f.keys():
-            param.data = f.get_tensor(name)
-
-model.eval()
+@spaces.GPU
+def load_model(params, model_path):
+    model = PixtralModel(params)
+
+    with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cuda") as f:
+        for name, param in model.named_parameters():
+            if name in f.keys():
+                param.data = f.get_tensor(name)
+
+    model.eval()
+    return model.cuda()
 
-# Initialize the tokenizer
+model = load_model(params, model_path)
 tokenizer = MistralTokenizer.from_model("pixtral")
 
+@spaces.GPU
 def process_image_and_text(image, prompt):
     # Prepare the image
     image = image.convert('RGB')
     image = image.resize((params['vision_encoder']['image_size'], params['vision_encoder']['image_size']))
     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
+    image_tensor = image_tensor.cuda()
 
     # Tokenize the input
     tokenized = tokenizer.encode_chat_completion(
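
Note: load_model now reads tensors straight onto the GPU (device="cuda" in safe_open) and keeps the module there via model.cuda(). On ZeroGPU Spaces, @spaces.GPU attaches a GPU only for the duration of the decorated call, and is more commonly placed on the top-level inference function alone. A CPU-tolerant variant of the same safetensors pattern (the fallback logic is an assumption, not the committed code):

    import torch
    from safetensors import safe_open

    def load_weights(model, weights_path):
        # Load to GPU when present, otherwise CPU; safe_open reads tensors
        # lazily, so only matching parameters are materialized.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        with safe_open(weights_path, framework="pt", device=device) as f:
            for name, param in model.named_parameters():
                if name in f.keys():
                    param.data = f.get_tensor(name)
        model.eval()
        return model.to(device)
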
@@ -169,4 +177,5 @@ with gr.Blocks() as demo:
     gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
     gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
 
-demo.launch()
+if __name__ == "__main__":
+    demo.launch()
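
Note: the new __main__ guard keeps demo.launch() from firing when app.py is imported as a module. A common companion on shared GPU hardware is request queuing (queue() is standard Gradio API; the max_size value is illustrative, the commit itself launches with defaults):

    if __name__ == "__main__":
        # Queue concurrent users instead of overlapping GPU calls.
        demo.queue(max_size=16).launch()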