MarcdeFalco committed (verified)
Commit ce78f1b · 1 Parent(s): 6f755ec

Migrate to transformers

Files changed (1)
  app.py  +67  -81
app.py CHANGED
@@ -1,83 +1,55 @@
-from huggingface_hub import InferenceClient
+import spaces
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from threading import Thread
 import gradio as gr
+import torch
 import os
 
-API_URL = {
-    "Mistral" : "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3",
-    "Mixtral" : "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "Mathstral" : "https://api-inference.huggingface.co/models/mistralai/mathstral-7B-v0.1",
-}
+device = "cuda"
 
-HF_TOKEN = os.environ['HF_TOKEN']
-
-mistralClient = InferenceClient(
-    API_URL["Mistral"],
-    headers = {"Authorization" : f"Bearer {HF_TOKEN}"},
-)
+model_name = "mistralai/mathstral-7B-v0.1"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name,
+    torch_dtype=torch.float16).to(device)
 
-mixtralClient = InferenceClient(
-    model = API_URL["Mixtral"],
-    headers = {"Authorization" : f"Bearer {HF_TOKEN}"},
-)
-
-mathstralClient = InferenceClient(
-    model = API_URL["Mathstral"],
-    headers = {"Authorization" : f"Bearer {HF_TOKEN}"},
-)
+HF_TOKEN = os.environ['HF_TOKEN']
 
 def format_prompt(message, history):
-    prompt = "<s>"
-
+    prompt = ""
     for user_prompt, bot_response in history:
         prompt += f"[INST] {user_prompt} [/INST]"
-        prompt += f" {bot_response}</s> "
+        prompt += f" {bot_response} "
     prompt += f"[INST] {message} [/INST]"
     return prompt
 
-def generate(prompt, history, temperature=0.9, max_new_tokens=256, top_p=0.95,
-             repetition_penalty=1.0, model = "Mathstral"):
-    # Selecting model to be used
-    if(model == "Mistral"):
-        client = mistralClient
-    elif(model == "Mixstral"):
-        client = mixtralClient
-    elif(model == "Mathstral"):
-        client = mathstralClient
-
-
-    temperature = float(temperature) # Generation arguments
-    if temperature < 1e-2:
-        temperature = 1e-2
-
-    top_p = float(top_p)
+@spaces.GPU
+def generate(prompt, history,
+             max_new_tokens=1024,
+             repetition_penalty=1.2):
 
+    formatted_prompt = format_prompt(prompt, history)
+    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
+
+    streamer = TextIteratorStreamer(tokenizer)
     generate_kwargs = dict(
-        temperature=temperature,
-        max_new_tokens=max_new_tokens,
-        top_p=top_p,
-        repetition_penalty=repetition_penalty,
-        do_sample=True,
-        seed=42,
+        inputs,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        repetition_penalty=repetition_penalty,
    )
-
-    formatted_prompt = format_prompt(prompt, history)
-    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
-    output = ""
-    for response in stream:
-        output += response.token.text
-        yield output
-    return output
+
+    thread = Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
+
+    text = ''
+    n = len('<s>') + len(formatted_prompt)
+    for word in streamer:
+        text += word
+        yield text[n:]
+    return text[n:]
+
 
 additional_inputs=[
-    gr.Slider(
-        label="Temperature",
-        value=0.3,
-        minimum=0.0,
-        maximum=1.0,
-        step=0.1,
-        interactive=True,
-        info="Higher values produce more diverse outputs",
-    ),
     gr.Slider(
         label="Max new tokens",
         value=1024,
@@ -87,15 +59,6 @@ additional_inputs=[
         interactive=True,
         info="The maximum numbers of new tokens",
     ),
-    gr.Slider(
-        label="Top-p (nucleus sampling)",
-        value=0.90,
-        minimum=0.0,
-        maximum=1,
-        step=0.05,
-        interactive=True,
-        info="Higher values sample more low-probability tokens",
-    ),
     gr.Slider(
         label="Repetition penalty",
         value=1.2,
@@ -105,15 +68,6 @@ additional_inputs=[
         interactive=True,
         info="Penalize repeated tokens",
     ),
-    gr.Dropdown(
-        choices = ["Mistral","Mixtral", "Mathstral"],
-        value = "Mathstral",
-        label = "Le modèle à utiliser",
-        interactive=True,
-        info = "Mistral : pour des conversations génériques, "+
-               "Mixtral : conversations plus rapides et plus performantes, "+
-               "Mathstral : raisonnement mathématiques et scientifique"
-    ),
 ]
 
 css = """
@@ -144,3 +98,35 @@ with gr.Blocks(css=css) as demo:
     )
 
 demo.queue(max_size=100).launch(debug=True)
+        : raisonnement mathématiques et scientifique"
+    ),
+]
+
+css = """
+#mkd {
+    height: 500px;
+    overflow: auto;
+    border: 1px solid #ccc;
+}
+"""
+
+with gr.Blocks(css=css) as demo:
+    gr.HTML("<h1><center>Mathstral Test</center><h1>")
+    gr.HTML("<h3><center>Dans cette démo, vous pouvez poser des questions mathématiques et scientifiques à Mathstral. 🧮</center><h3>")
+    gr.ChatInterface(
+        generate,
+        additional_inputs=additional_inputs,
+        theme = gr.themes.Soft(),
+        cache_examples=False,
+        examples=[ [l.strip()] for l in open("exercices.md").readlines()],
+        chatbot = gr.Chatbot(
+            latex_delimiters=[
+                {"left" : "$$", "right": "$$", "display": True },
+                {"left" : "\\[", "right": "\\]", "display": True },
+                {"left" : "\\(", "right": "\\)", "display": False },
+                {"left": "$", "right": "$", "display": False }
+            ]
+        )
+    )
+
+demo.queue(max_size=100).launch(debug=True)
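For reference, a minimal sketch of the threaded streaming pattern the new app.py relies on (local transformers generation with TextIteratorStreamer instead of the Inference API). The helper name stream_reply, the skip_prompt/skip_special_tokens options, and the example prompt are illustrative and not part of the commit; the model name and device follow the diff above.

    from threading import Thread

    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

    model_name = "mistralai/mathstral-7B-v0.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")

    def stream_reply(prompt: str, max_new_tokens: int = 256):
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        # skip_prompt=True makes the streamer yield only newly generated text,
        # so no manual slicing of the prompt is needed.
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        generate_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
        # model.generate blocks until generation finishes, so it runs in a
        # background thread while this generator consumes tokens as they arrive.
        Thread(target=model.generate, kwargs=generate_kwargs).start()
        text = ""
        for chunk in streamer:
            text += chunk
            yield text

    # Example usage (hypothetical prompt):
    # for partial in stream_reply("[INST] Prove that sqrt(2) is irrational. [/INST]"):
    #     print(partial)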