from threading import Thread
from typing import Iterator

import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)
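
# Generation limits: hard cap on new tokens, default slider value, and prompt-length budget.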
MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 256
MAX_INPUT_TOKEN_LENGTH = 512
DESCRIPTION = """\
# OpenELM-270M-Instruct -- Running on CPU
This Space demonstrates [apple/OpenELM-270M-Instruct](https://huggingface.co/apple/OpenELM-270M-Instruct) by Apple. Please check the original model card for details.
For details on the OpenELM models, see the [paper page](https://huggingface.co/papers/2404.14619).
For details on the pre-training, instruction tuning, and parameter-efficient fine-tuning process, see the [OpenELM page in the CoreNet GitHub repository](https://github.com/apple/corenet/tree/main/projects/openelm).
"""
LICENSE = """
<p/>
---
As a derivative work of [apple/OpenELM-270M-Instruct](https://huggingface.co/apple/OpenELM-270M-Instruct) by Apple,
this demo is governed by the original [license](https://huggingface.co/apple/OpenELM-270M-Instruct/blob/main/LICENSE).
Based on the [Norod78/OpenELM_3B_Demo](https://huggingface.co/spaces/Norod78/OpenELM_3B_Demo) Space.
"""
model = AutoModelForCausalLM.from_pretrained(
    "apple/OpenELM-270M-Instruct",
    trust_remote_code=True,
)
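
# OpenELM ships without a tokenizer; per the model card, it uses the Llama-2 tokenizer.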
tokenizer = AutoTokenizer.from_pretrained(
    "NousResearch/Llama-2-7b-hf",
    trust_remote_code=True,
)
# Llama-2 has no pad token; fall back to EOS so padding and generation behave correctly.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.eos_token_id
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.1,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.4,
) -> Iterator[str]:
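    """Stream the model's reply to `message`, with the chat history prepended as plain text."""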
    historical_text = ""
    # Prepend the entire chat history to the message, one turn per line.
    for user, assistant in chat_history:
        historical_text += f"\n{user}\n{assistant}"
    if historical_text:
        message = historical_text + f"\n{message}"
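    # Tokenize the combined prompt and keep only the most recent MAX_INPUT_TOKEN_LENGTH tokens.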
    input_ids = tokenizer([message], return_tensors="pt").input_ids
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)
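    # Stream decoded text as it is generated; skip echoing the prompt and special tokens.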
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=5,
        early_stopping=False,
    )
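    # Run generation on a background thread so this generator can yield partial output.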
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
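
# Chat UI: the sliders below are passed to generate() as additional positional arguments.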
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.4,
        ),
    ],
    stop_btn=None,
    examples=[
        ["Explain quantum physics in 5 words or less:"],
        ["Question: What do you call a bear with no teeth?\nAnswer:"],
    ],
)
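
# Page layout: description, the chat interface, then the license notice.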
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    chat_interface.render()
    gr.Markdown(LICENSE)

if __name__ == "__main__":
    demo.queue(max_size=20).launch()