sagar007 committed on
Commit
5904b1d
·
verified ·
1 Parent(s): 98376b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -18
app.py CHANGED
@@ -1,13 +1,9 @@
1
  import os
2
  import torch
3
- from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, TextIteratorStreamer, BitsAndBytesConfig
4
  import gradio as gr
5
  from threading import Thread
6
  from PIL import Image
7
- import subprocess
8
-
9
- # Install flash-attention
10
- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
11
 
12
  # Constants
13
  TITLE = "<h1><center>Phi 3.5 Multimodal (Text + Vision)</center></h1>"
@@ -19,28 +15,23 @@ VISION_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
19
 
20
  device = "cuda" if torch.cuda.is_available() else "cpu"
21
 
22
- # Quantization config for text model
23
- quantization_config = BitsAndBytesConfig(
24
- load_in_4bit=True,
25
- bnb_4bit_compute_dtype=torch.bfloat16,
26
- bnb_4bit_use_double_quant=True,
27
- bnb_4bit_quant_type="nf4"
28
- )
29
-
30
  # Load models and tokenizers
31
  text_tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_ID)
32
  text_model = AutoModelForCausalLM.from_pretrained(
33
  TEXT_MODEL_ID,
34
- torch_dtype=torch.bfloat16,
35
- device_map="auto",
36
- quantization_config=quantization_config
37
  )
38
 
 
 
 
39
  vision_model = AutoModelForCausalLM.from_pretrained(
40
  VISION_MODEL_ID,
41
  trust_remote_code=True,
42
- torch_dtype="auto",
43
- attn_implementation="flash_attention_2"
44
  ).to(device).eval()
45
 
46
  vision_processor = AutoProcessor.from_pretrained(VISION_MODEL_ID, trust_remote_code=True)
@@ -125,4 +116,5 @@ with gr.Blocks() as demo:
125
  vision_submit_btn.click(process_vision_query, [vision_input_img, vision_text_input], [vision_output_text])
126
 
127
  if __name__ == "__main__":
 
128
  demo.launch()
 
1
  import os
2
  import torch
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, TextIteratorStreamer
4
  import gradio as gr
5
  from threading import Thread
6
  from PIL import Image
 
 
 
 
7
 
8
  # Constants
9
  TITLE = "<h1><center>Phi 3.5 Multimodal (Text + Vision)</center></h1>"
 
15
 
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
17
 
 
 
 
 
 
 
 
 
18
  # Load models and tokenizers
19
  text_tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_ID)
20
  text_model = AutoModelForCausalLM.from_pretrained(
21
  TEXT_MODEL_ID,
22
+ torch_dtype=torch.float32 if device == "cpu" else torch.float16,
23
+ device_map="auto" if device == "cuda" else None,
24
+ low_cpu_mem_usage=True
25
  )
26
 
27
+ if device == "cuda":
28
+ text_model = text_model.half() # Convert to half precision if on GPU
29
+
30
  vision_model = AutoModelForCausalLM.from_pretrained(
31
  VISION_MODEL_ID,
32
  trust_remote_code=True,
33
+ torch_dtype=torch.float32 if device == "cpu" else torch.float16,
34
+ low_cpu_mem_usage=True
35
  ).to(device).eval()
36
 
37
  vision_processor = AutoProcessor.from_pretrained(VISION_MODEL_ID, trust_remote_code=True)
 
116
  vision_submit_btn.click(process_vision_query, [vision_input_img, vision_text_input], [vision_output_text])
117
 
118
  if __name__ == "__main__":
119
+ print(f"Running on device: {device}")
120
  demo.launch()