sagar007 committed
Commit 2796a5e · verified · 1 Parent(s): ab8bcac

Update app.py

Files changed (1)
  1. app.py +18 -8
app.py CHANGED
@@ -1,9 +1,13 @@
 import os
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, TextIteratorStreamer, BitsAndBytesConfig
 import gradio as gr
 from threading import Thread
 from PIL import Image
+import subprocess
+
+# Install flash-attention
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 # Constants
 TITLE = "<h1><center>Phi 3.5 Multimodal (Text + Vision)</center></h1>"
@@ -14,23 +18,29 @@ TEXT_MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
 VISION_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {device}")
+
+# Quantization config for text model
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
 
 # Load models and tokenizers
 text_tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_ID)
 text_model = AutoModelForCausalLM.from_pretrained(
     TEXT_MODEL_ID,
-    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+    torch_dtype=torch.bfloat16,
     device_map="auto",
-    low_cpu_mem_usage=True
+    quantization_config=quantization_config
 )
 
 vision_model = AutoModelForCausalLM.from_pretrained(
     VISION_MODEL_ID,
     trust_remote_code=True,
-    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-    attn_implementation="flash_attention_2" if device == "cuda" else None,
-    low_cpu_mem_usage=True
+    torch_dtype="auto",
+    attn_implementation="flash_attention_2"
 ).to(device).eval()
 
 vision_processor = AutoProcessor.from_pretrained(VISION_MODEL_ID, trust_remote_code=True)
@@ -45,7 +55,7 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
     ])
     conversation.append({"role": "user", "content": message})
 
-    input_ids = text_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(device)
+    input_ids = text_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(text_model.device)
     streamer = TextIteratorStreamer(text_tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
 
     generate_kwargs = dict(
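
A note on the new install line: passing `env={...}` to `subprocess.run` replaces the child process's entire environment rather than extending it, so `PATH` may be empty when the shell resolves `pip`. A minimal sketch of a variant that inherits the current environment (same command, same skip-build flag):

```python
import os
import subprocess

# Copy the current environment so PATH, CUDA_HOME, etc. survive,
# then add the flag that skips compiling flash-attn's CUDA kernels.
env = os.environ.copy()
env["FLASH_ATTENTION_SKIP_CUDA_BUILD"] = "TRUE"
subprocess.run("pip install flash-attn --no-build-isolation", env=env, shell=True, check=True)
```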
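The text model now loads through bitsandbytes 4-bit NF4 quantization instead of plain fp16. As a standalone sketch of what this configuration does (the footprint check is illustrative, not part of the commit; requires the `bitsandbytes` package and a CUDA device):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# NF4 4-bit quantization: weights are stored in 4 bits, matmuls run in bf16;
# double quantization also compresses the quantization constants themselves.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct",
    quantization_config=quantization_config,
    device_map="auto",  # places quantized layers on the available GPU(s)
)

# Quantized size is roughly a quarter of the fp16 footprint.
print(f"Model footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
```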
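The last hunk cuts off as `generate_kwargs` is being built. For context, a `TextIteratorStreamer` is typically consumed by running `generate` in a background thread and iterating the streamer as tokens arrive; a minimal sketch of that standard pattern (the helper name and exact kwargs are assumptions, not taken from app.py):

```python
from threading import Thread

def stream_reply(model, tokenizer, input_ids, streamer, temperature=0.8, max_new_tokens=1024):
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,              # generate() pushes decoded text into the streamer
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,
        temperature=temperature,
    )
    # generate() blocks until completion, so it runs in a worker thread
    # while the main thread consumes tokens as they are produced.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:           # yields text chunks incrementally
        partial += new_text
        yield partial                   # Gradio re-renders the growing reply
    thread.join()
```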