taufiqdp commited on
Commit
f816b98
•
1 Parent(s): 3d25b03

Upload folder using huggingface_hub

Files changed (2)
  1. app.py +124 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,124 @@
+ import os
+ import subprocess
+ from threading import Thread
+
+ import gradio as gr
+ import spaces
+ import torch
+ from huggingface_hub import login
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+
+ login(os.environ.get("HF_TOKEN"))
+ # Install flash-attn at runtime; skip the CUDA build step and keep the existing
+ # environment (replacing env entirely would drop PATH and break the pip call).
+ subprocess.run(
+     "pip install flash-attn --no-build-isolation",
+     env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+     shell=True,
+ )
+
+ model_id = "microsoft/Phi-3-mini-128k-instruct"
+
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="auto",
+     attn_implementation="flash_attention_2",
+ )
+
+
+ @spaces.GPU()
+ def generate(
+     message: str,
+     chat_history: list[tuple[str, str]],
+     system_prompt: str,
+     max_new_tokens: int,
+     temperature: float,
+     top_p: float,
+     top_k: int,
+     repetition_penalty: float,
+ ):
+     # Rebuild the conversation in the message format expected by the chat template.
+     conversation = []
+     if system_prompt:
+         conversation.append({"role": "system", "content": system_prompt})
+     for user, assistant in chat_history:
+         conversation.append({"role": "user", "content": user})
+         conversation.append({"role": "assistant", "content": assistant})
+     conversation.append({"role": "user", "content": message})
+
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+     inputs = tokenizer.apply_chat_template(
+         conversation,
+         add_generation_prompt=True,
+         return_tensors="pt",
+         return_dict=True,
+     ).to(model.device)
+
+     generate_kwargs = dict(
+         input_ids=inputs["input_ids"],
+         attention_mask=inputs["attention_mask"],
+         streamer=streamer,
+         do_sample=True,
+         temperature=temperature,
+         max_new_tokens=max_new_tokens,
+         top_k=top_k,
+         repetition_penalty=repetition_penalty,
+         top_p=top_p,
+     )
+
+     # Run generation on a background thread so tokens can be yielded as they stream in.
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+     outputs = []
+     for new_token in streamer:
+         outputs.append(new_token)
+         yield "".join(outputs)
+
+
+ gr.ChatInterface(
+     fn=generate,
+     title="🚀 Phi-3 mini 128k instruct",
+     description="",
+     additional_inputs=[
+         gr.Textbox(
+             label="System prompt",
+             lines=5,
+             value="You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.",
+         ),
+         gr.Slider(
+             label="Max new tokens",
+             minimum=1,
+             maximum=2048,
+             step=1,
+             value=1024,
+         ),
+         gr.Slider(
+             label="Temperature",
+             minimum=0.1,
+             maximum=4.0,
+             step=0.1,
+             value=0.6,
+         ),
+         gr.Slider(
+             label="Top-p (nucleus sampling)",
+             minimum=0.05,
+             maximum=1.0,
+             step=0.05,
+             value=0.9,
+         ),
+         gr.Slider(
+             label="Top-k",
+             minimum=1,
+             maximum=1000,
+             step=1,
+             value=50,
+         ),
+         gr.Slider(
+             label="Repetition penalty",
+             minimum=1.0,
+             maximum=2.0,
+             step=0.05,
+             value=1.2,
+         ),
+     ],
+     stop_btn=None,
+     examples=[
+         ["Can you provide ways to eat combinations of bananas and dragonfruits?"],
+         ["Write a story about a dragon fruit that flies into outer space!"],
+         ["I am going to Bali, what should I see?"],
+     ],
+ ).queue().launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers==4.40.0
+ accelerate==0.29.3
+ gradio==4.27.0