llama-2-chat-della

Paused

App Files Files Community

LeoYu committed on Sep 19, 2023

Commit

6c3bb74

1 Parent(s): 5b351de

support 70b

Browse files

Files changed (3) hide show

README.md +8 -0
app.py +25 -15
model.py +61 -59

README.md CHANGED Viewed

@@ -11,6 +11,14 @@ license: other
 suggested_hardware: a10g-small
 ---
 # LLAMA v2 Models
 Llama v2 was introduced in [this paper](https://arxiv.org/abs/2307.09288).

 suggested_hardware: a10g-small
 ---
+# Llama 2 chatbot (Della)
+This is a minimal chatbot based on [huggingface-projects/llama-2-13b-chat](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat).
+It is modified because Della does not provide internet access to its compute nodes.
+Below is the original README.
 # LLAMA v2 Models
 Llama v2 was introduced in [this paper](https://arxiv.org/abs/2307.09288).

app.py CHANGED Viewed

@@ -3,7 +3,11 @@ from typing import Iterator
 import gradio as gr
 import torch
-from model import get_input_token_length, run
 DEFAULT_SYSTEM_PROMPT = """\
 You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
@@ -12,24 +16,30 @@ MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = 4000
-DESCRIPTION = """
-# Llama-2 13B Chat
-This Space demonstrates model [Llama-2-13b-chat](https://huggingface.co/meta-llama/Llama-2-13b-chat) by Meta, a Llama 2 model with 13B parameters fine-tuned for chat instructions. Feel free to play with it, or duplicate to run generations without a queue! If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://huggingface.co/inference-endpoints).
-🔎 For more details about the Llama 2 family of models and how to use them with `transformers`, take a look [at our blog post](https://huggingface.co/blog/llama2).
-🔨 Looking for an even more powerful model? Check out the large [**70B** model demo](https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI).
-🐇 For a smaller model that you can run on many GPUs, check our [7B model demo](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat).
-"""
-LICENSE = """
 <p/>
 ---
-As a derivate work of [Llama-2-13b-chat](https://huggingface.co/meta-llama/Llama-2-13b-chat) by Meta,
-this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/USE_POLICY.md).
 """
 if not torch.cuda.is_available():
@@ -68,7 +78,7 @@ def generate(
         raise ValueError
     history = history_with_input[:-1]
-    generator = run(message, history, system_prompt, max_new_tokens, temperature, top_p, top_k)
     try:
         first_response = next(generator)
         yield history + [(message, first_response)]
@@ -86,7 +96,7 @@ def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
 def check_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> None:
-    input_token_length = get_input_token_length(message, chat_history, system_prompt)
     if input_token_length > MAX_INPUT_TOKEN_LENGTH:
         raise gr.Error(f'The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again.')
@@ -160,7 +170,7 @@ with gr.Blocks(css='style.css') as demo:
         inputs=textbox,
         outputs=[textbox, chatbot],
         fn=process_example,
-        cache_examples=True,
     )
     gr.Markdown(LICENSE)
@@ -277,4 +287,4 @@ with gr.Blocks(css='style.css') as demo:
         api_name=False,
     )
-demo.queue(max_size=20).launch()

 import gradio as gr
 import torch
+from model_any import LlamaModel
+model_id = "meta-llama/Llama-2-70b-chat"
+model_size = "70"
+pipeline = LlamaModel(model_id)
 DEFAULT_SYSTEM_PROMPT = """\
 You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = 4000
+DESCRIPTION = f"""
+Llama-2 {model_size}B Chat
+{model_id}
+"""
+# DESCRIPTION = """
+# # Llama-2 13B Chat
+# This Space demonstrates model [Llama-2-13b-chat](https://huggingface.co/meta-llama/Llama-2-13b-chat) by Meta, a Llama 2 model with 13B parameters fine-tuned for chat instructions. Feel free to play with it, or duplicate to run generations without a queue! If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://huggingface.co/inference-endpoints).
+# 🔎 For more details about the Llama 2 family of models and how to use them with `transformers`, take a look [at our blog post](https://huggingface.co/blog/llama2).
+# 🔨 Looking for an even more powerful model? Check out the large [**70B** model demo](https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI).
+# 🐇 For a smaller model that you can run on many GPUs, check our [7B model demo](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat).
+# """
+LICENSE = f"""
 <p/>
 ---
+As a derivate work of [Llama-2-{model_size}b-chat](https://huggingface.co/meta-llama/Llama-2-{model_size}b-chat) by Meta,
+this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-{model_size}b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-{model_size}b-chat/blob/main/USE_POLICY.md).
 """
 if not torch.cuda.is_available():
         raise ValueError
     history = history_with_input[:-1]
+    generator = pipeline.run(message, history, system_prompt, max_new_tokens, temperature, top_p, top_k)
     try:
         first_response = next(generator)
         yield history + [(message, first_response)]
 def check_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> None:
+    input_token_length = pipeline.get_input_token_length(message, chat_history, system_prompt)
     if input_token_length > MAX_INPUT_TOKEN_LENGTH:
         raise gr.Error(f'The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again.')
         inputs=textbox,
         outputs=[textbox, chatbot],
         fn=process_example,
+        cache_examples=False,
     )
     gr.Markdown(LICENSE)
         api_name=False,
     )
+demo.queue(max_size=20).launch(share=True)

model.py CHANGED Viewed

@@ -4,71 +4,73 @@ from typing import Iterator
 import torch
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-model_id = 'meta-llama/Llama-2-13b-chat-hf'
-if torch.cuda.is_available():
-    config = AutoConfig.from_pretrained(model_id)
-    config.pretraining_tp = 1
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        config=config,
-        torch_dtype=torch.float16,
-        load_in_4bit=True,
-        device_map='auto'
-    )
-else:
-    model = None
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-def get_prompt(message: str, chat_history: list[tuple[str, str]],
-               system_prompt: str) -> str:
-    texts = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
-    # The first user input is _not_ stripped
-    do_strip = False
-    for user_input, response in chat_history:
-        user_input = user_input.strip() if do_strip else user_input
-        do_strip = True
-        texts.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
-    message = message.strip() if do_strip else message
-    texts.append(f'{message} [/INST]')
-    return ''.join(texts)
-def get_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> int:
-    prompt = get_prompt(message, chat_history, system_prompt)
-    input_ids = tokenizer([prompt], return_tensors='np', add_special_tokens=False)['input_ids']
-    return input_ids.shape[-1]
-def run(message: str,
-        chat_history: list[tuple[str, str]],
-        system_prompt: str,
-        max_new_tokens: int = 1024,
-        temperature: float = 0.8,
-        top_p: float = 0.95,
-        top_k: int = 50) -> Iterator[str]:
-    prompt = get_prompt(message, chat_history, system_prompt)
-    inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
-    streamer = TextIteratorStreamer(tokenizer,
-                                    timeout=10.,
-                                    skip_prompt=True,
-                                    skip_special_tokens=True)
-    generate_kwargs = dict(
-        inputs,
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        top_p=top_p,
-        top_k=top_k,
-        temperature=temperature,
-        num_beams=1,
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        yield ''.join(outputs)

 import torch
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+class LlamaModel:  # Bundles a causal LM and its tokenizer (both loaded from model_id) with prompt building, token counting and streamed generation.
+    def __init__(self, model_id: str):  # model_id is passed unchanged to every from_pretrained call below
+        self.model_id = model_id
+        if torch.cuda.is_available():  # the model weights are only loaded when CUDA is available
+            config = AutoConfig.from_pretrained(model_id)
+            config.pretraining_tp = 1  # NOTE(review): presumably turns off the pretraining tensor-parallel code path — confirm in the LlamaConfig docs
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                config=config,
+                torch_dtype=torch.float16,  # request half-precision weights
+                load_in_4bit=False,  # 4-bit loading is explicitly switched off
+                device_map='auto'  # NOTE(review): presumably lets the library choose device placement — confirm
+            )
+        else:
+            self.model = None  # no CUDA: no model is loaded, so run() cannot generate
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)  # the tokenizer is loaded regardless of CUDA availability
+    def get_prompt(self, message: str, chat_history: list[tuple[str, str]],  # Returns one prompt string: system block, then every past turn, then the new message.
+                system_prompt: str) -> str:
+        texts = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']  # the prompt opens with the system prompt wrapped in <<SYS>> tags
+        # The first user input is _not_ stripped
+        do_strip = False  # becomes True after the first history turn has been processed
+        for user_input, response in chat_history:  # each history item is a (user_input, response) pair
+            user_input = user_input.strip() if do_strip else user_input
+            do_strip = True
+            texts.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')  # close the finished turn and open the next [INST]
+        message = message.strip() if do_strip else message  # the new message is stripped only when there was prior history
+        texts.append(f'{message} [/INST]')
+        return ''.join(texts)
+    def get_input_token_length(self, message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> int:  # Returns the token count of the prompt built from these arguments.
+        prompt = self.get_prompt(message, chat_history, system_prompt)
+        input_ids = self.tokenizer([prompt], return_tensors='np', add_special_tokens=False)['input_ids']  # no special tokens are added; the prompt text already contains literal <s> / </s> markers
+        return input_ids.shape[-1]  # size of the last axis of input_ids for the single prompt
+    def run(self, message: str,  # Generator: yields the accumulated response text each time the streamer delivers a new chunk.
+            chat_history: list[tuple[str, str]],
+            system_prompt: str,
+            max_new_tokens: int = 1024,
+            temperature: float = 0.8,
+            top_p: float = 0.95,
+            top_k: int = 50) -> Iterator[str]:
+        prompt = self.get_prompt(message, chat_history, system_prompt)
+        inputs = self.tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')  # NOTE(review): hard-codes 'cuda'; without CUDA self.model is None, so this method cannot work
+        streamer = TextIteratorStreamer(self.tokenizer,
+                                        timeout=10.,  # NOTE(review): presumably seconds to wait for the next chunk — confirm in the streamer docs
+                                        skip_prompt=True,
+                                        skip_special_tokens=True)
+        generate_kwargs = dict(
+            inputs,  # the tokenizer output's entries are merged into the kwargs handed to generate
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            top_p=top_p,
+            top_k=top_k,
+            temperature=temperature,
+            num_beams=1,
+        )
+        t = Thread(target=self.model.generate, kwargs=generate_kwargs)  # generate runs in a separate thread so the streamer can be consumed below; NOTE(review): Thread is not imported in the visible hunk — presumably imported above
+        t.start()
+        outputs = []  # chunks received from the streamer so far
+        for text in streamer:
+            outputs.append(text)
+            yield ''.join(outputs)  # yield the full text so far, not just the newest chunk