File size: 3,108 Bytes
8b8b0b2
 
 
bb35b84
 
8b8b0b2
 
 
 
 
 
bb35b84
 
 
 
8b8b0b2
 
bb35b84
 
 
 
 
 
 
 
 
8b8b0b2
 
25721d0
8b8b0b2
 
 
 
 
 
 
 
 
 
 
bb35b84
 
 
 
 
 
 
 
5b4a85e
bb35b84
 
8b8b0b2
bb35b84
 
8b8b0b2
bb35b84
 
8b8b0b2
bb35b84
8b8b0b2
 
 
1300829
 
8d83783
 
2a2899d
1300829
8b8b0b2
2a2899d
25721d0
4695c36
 
bb35b84
 
a446ae2
bb35b84
 
 
 
 
 
2c96457
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gradio as gr

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, pipeline
from threading import Thread

# The huggingface model id for Microsoft's phi-2 model
checkpoint = "microsoft/phi-2"

# Download and load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float32, device_map="auto", trust_remote_code=True)

# Streamer that yields decoded text chunks as generation proceeds.
# skip_prompt=True drops the echoed input prompt from the stream;
# skip_special_tokens=True keeps special-token text such as "<|endoftext|>"
# from leaking into the streamed chat response shown in the UI.
streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True)

# Text generation pipeline shared by all requests; pad/eos are pinned to the
# tokenizer's EOS id so generation stops cleanly without a pad-token warning.
phi2 = pipeline(
    "text-generation", 
    tokenizer=tokenizer, 
    model=model, 
    streamer=streamer, 
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    device_map="auto"
)

# Function that accepts a prompt and generates text using the phi2 pipeline
def generate(prompt, chat_history, max_new_tokens):
  """Stream a chat response from the phi-2 pipeline.

  Builds an instruction-style prompt from the conversation history, runs the
  shared `phi2` pipeline in a background thread, and yields the Gradio
  outputs ("" to clear the textbox, plus the updated history) once per
  streamed chunk so the chatbot updates incrementally.

  Args:
    prompt: The user's new message.
    chat_history: List of (user_message, assistant_reply) tuples; the last
      entry is mutated in place as the reply streams in.
    max_new_tokens: Generation budget passed through to the pipeline.

  Yields:
    ("", chat_history) after each streamed chunk.
  """
  instruction = "You are a helpful assistant to 'User'. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'."
  final_prompt = f"Instruction: {instruction}\n"

  # Replay the conversation so far in the User:/Assistant: transcript format.
  for sent, received in chat_history:
    final_prompt += "User: " + sent + "\n"
    final_prompt += "Assistant: " + received + "\n"

  final_prompt += "User: " + prompt + "\n"
  final_prompt += "Output:"

  # Run generation off the main thread; the global streamer (attached to the
  # pipeline) yields decoded text as it is produced.
  thread = Thread(target=phi2, kwargs={"text_inputs": final_prompt, "max_new_tokens": max_new_tokens})
  thread.start()

  generated_text = ""
  chat_history.append((prompt, ""))
  for word in streamer:
    generated_text += word
    # Strip the EOS marker defensively: if the streamer was not configured
    # with skip_special_tokens, "<|endoftext|>" would otherwise show up in
    # the chat window. Harmless no-op when the streamer already skips it.
    response = generated_text.replace(tokenizer.eos_token, "").strip()

    # The model may keep writing past its own turn; drop any hallucinated
    # follow-up "User:" turn and keep only the assistant's reply text.
    if "User:" in response:
      response = response.split("User:")[0].strip()

    if "Assistant:" in response:
      response = response.split("Assistant:")[1].strip()

    # Update the in-progress last history entry in place (avoids the
    # pop()+append churn of rebuilding the tail on every streamed chunk).
    chat_history[-1] = (prompt, response)

    yield "", chat_history

# Chat interface with gradio
# Chat interface with gradio
with gr.Blocks() as demo:
  # Page header / usage notes rendered above the chat widgets.
  gr.Markdown("""
  # Phi-2 Chatbot Demo
  This chatbot was created using Microsoft's 2.7 billion parameter [phi-2](https://huggingface.co/microsoft/phi-2) Transformer model. 
  
  In order to reduce the response time on this hardware, `max_new_tokens` has been set to `21` in the text generation pipeline. With this default configuration, it takes approximately `60 seconds` for the response to start being generated, and streamed one word at a time. Use the slider below to increase or decrease the length of the generated text.
  """)

  # Slider value is passed straight through to generate() as max_new_tokens.
  tokens_slider = gr.Slider(8, 128, value=21, label="Maximum new tokens", info="A larger `max_new_tokens` parameter value gives you longer text responses but at the cost of a slower response time.")

  chatbot = gr.Chatbot(label="Phi-2 Chatbot")
  msg = gr.Textbox(label="Message", placeholder="Enter text here")
  # Send and Clear buttons side by side; Clear resets both textbox and chat.
  with gr.Row():
    with gr.Column():
      btn = gr.Button("Send")
    with gr.Column():
      clear = gr.ClearButton([msg, chatbot])

  # generate() is a generator, so Gradio streams each yielded (msg, history)
  # pair back into the textbox (cleared) and the chatbot widget.
  btn.click(fn=generate, inputs=[msg, chatbot, tokens_slider], outputs=[msg, chatbot])
  examples = gr.Examples(examples=["Who is Leonhard Euler?"], inputs=[msg])
  
demo.launch()