MaziyarPanahi committed on
Commit
79b926d
·
verified ·
1 Parent(s): 7d910e3
Files changed (1) hide show
  1. app.py +37 -38
app.py CHANGED
@@ -11,11 +11,15 @@ from transformers import TextIteratorStreamer
11
  import subprocess
12
  subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
13
 
 
14
  PLACEHOLDER = """
15
  <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
16
- <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">microsoft/Phi-3-vision-128k-instruct</h1>
 
 
17
  </div>
18
  """
 
19
  user_prompt = '<|user|>\n'
20
  assistant_prompt = '<|assistant|>\n'
21
  prompt_suffix = "<|end|>\n"
@@ -37,7 +41,8 @@ model.to("cuda:0")
37
 
38
  @spaces.GPU
39
  def bot_streaming(message, history):
40
- print(message)
 
41
  if message["files"]:
42
  # message["files"][-1] is a Dict or just a string
43
  if type(message["files"][-1]) == dict:
@@ -53,54 +58,48 @@ def bot_streaming(message, history):
53
  try:
54
  if image is None:
55
  # Handle the case where image is None
56
- gr.Error("You need to upload an image for Phi-3-vision to work.")
57
  except NameError:
58
  # Handle the case where 'image' is not defined at all
59
- gr.Error("You need to upload an image for Phi-3-vision to work.")
60
-
61
- # prompt = f"{message['text']}<|image_1|>\nCan you convert the table to markdown format?{prompt_suffix}{assistant_prompt}"
62
- chat = [
63
- {"role": "user", "content": f"<|image_1|>\n{message['text']}"},
64
- ]
65
- prompt = processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
66
-
67
- # need to remove last <|endoftext|> if it is there, which is used for training, not inference. For training, make sure to add <|endoftext|> in the end.
68
- if prompt.endswith("<|endoftext|>"):
69
- prompt = prompt.rstrip("<|endoftext|>")
70
-
71
- print(f">>> Prompt\n{prompt})")
72
-
 
 
 
 
 
 
 
 
 
73
  image = Image.open(image)
74
- inputs = processor(prompt, [image], return_tensors='pt').to("cuda:0")
75
-
76
- streamer = TextIteratorStreamer(processor, **{"skip_special_tokens": False, "skip_prompt": True})
77
- generation_kwargs = dict(
78
- inputs,
79
- streamer=streamer,
80
- max_new_tokens=1024,
81
- do_sample=False,
82
- temperature=0.0,
83
- eos_token_id=processor.tokenizer.eos_token_id
84
- )
85
 
86
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
87
  thread.start()
88
 
89
  buffer = ""
90
- time.sleep(0.5)
91
  for new_text in streamer:
92
- # if "<|endoftext|>" in new_text:
93
- # break
94
  buffer += new_text
95
-
96
- generated_text_without_prompt = buffer
97
- # print(generated_text_without_prompt)
98
- time.sleep(0.06)
99
- # print(f"new_text: {generated_text_without_prompt}")
100
- yield generated_text_without_prompt
101
 
102
 
103
- chatbot = gr.Chatbot(placeholder=PLACEHOLDER, scale=1, height=550)
104
  chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...",
105
  show_label=False)
106
  with gr.Blocks(fill_height=True, ) as demo:
 
11
  import subprocess
12
  subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
13
 
14
+ # thanks to https://huggingface.co/ysharma
15
  PLACEHOLDER = """
16
  <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
17
+ <img src="https://cdn-thumbnails.huggingface.co/social-thumbnails/models/microsoft/Phi-3-vision-128k-instruct.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
18
+ <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Microsoft's Phi3-Vision-128k-Context</h1>
19
+ <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Phi-3-Vision is a 4.2B parameter multimodal model that brings together language and vision capabilities.</p>
20
  </div>
21
  """
22
+
23
  user_prompt = '<|user|>\n'
24
  assistant_prompt = '<|assistant|>\n'
25
  prompt_suffix = "<|end|>\n"
 
41
 
42
  @spaces.GPU
43
  def bot_streaming(message, history):
44
+ print(f'message is - {message}')
45
+ print(f'history is - {history}')
46
  if message["files"]:
47
  # message["files"][-1] is a Dict or just a string
48
  if type(message["files"][-1]) == dict:
 
58
  try:
59
  if image is None:
60
  # Handle the case where image is None
61
+ raise gr.Error("You need to upload an image for Phi3-Vision to work. Close the error and try again with an Image.")
62
  except NameError:
63
  # Handle the case where 'image' is not defined at all
64
+ raise gr.Error("You need to upload an image for Phi3-Vision to work. Close the error and try again with an Image.")
65
+
66
+ conversation = []
67
+ flag=False
68
+ for user, assistant in history:
69
+ if assistant is None:
70
+ #pass
71
+ flag=True
72
+ conversation.extend([{"role": "user", "content":""}])
73
+ continue
74
+ if flag==True:
75
+ conversation[0]['content'] = f"<|image_1|>\n{user}"
76
+ conversation.extend([{"role": "assistant", "content": assistant}])
77
+ flag=False
78
+ continue
79
+ conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
80
+
81
+ if len(history) == 0:
82
+ conversation.append({"role": "user", "content": f"<|image_1|>\n{message['text']}"})
83
+ else:
84
+ conversation.append({"role": "user", "content": message['text']})
85
+ print(f"prompt is -\n{conversation}")
86
+ prompt = processor.tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
87
  image = Image.open(image)
88
+ inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
89
+
90
+ streamer = TextIteratorStreamer(processor, **{"skip_special_tokens": True, "skip_prompt": True, 'clean_up_tokenization_spaces':False,})
91
+ generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, do_sample=False, temperature=0.0, eos_token_id=processor.tokenizer.eos_token_id,)
 
 
 
 
 
 
 
92
 
93
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
94
  thread.start()
95
 
96
  buffer = ""
 
97
  for new_text in streamer:
 
 
98
  buffer += new_text
99
+ yield buffer
 
 
 
 
 
100
 
101
 
102
+ chatbot = gr.Chatbot(placeholder=PLACEHOLDER, scale=1)
103
  chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...",
104
  show_label=False)
105
  with gr.Blocks(fill_height=True, ) as demo: