hosseinhimself committed
Commit fa5cabf · verified · 1 Parent(s): 8a7fb58

Update app.py

Files changed (1)
  1. app.py +60 -68
app.py CHANGED
@@ -1,81 +1,73 @@
  import gradio as gr
  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import os
-
- os.system("pip install -U bitsandbytes")
-
- model_name = "hosseinhimself/ISANG-v1.0-8B"
-
- # Ensure CUDA is not used
- torch.set_default_device("cpu")
-
- # Load tokenizer globally
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
- def load_model():
-     try:
-         model = AutoModelForCausalLM.from_pretrained(
-             model_name,
-             torch_dtype=torch.float32,  # Ensure compatibility with CPU
-             trust_remote_code=True,
-             low_cpu_mem_usage=True  # Optimize memory usage
          )
-         model.to("cpu")  # Explicitly load the model on CPU
-         print("Model loaded successfully on CPU.")
-         return model
      except Exception as e:
-         print(f"Error loading model: {e}")
-         raise
-
-
- def stream_chat(prompt, history):
-     model = load_model()
-     # Add system prompt
-     system_prompt = "You are ISANG, a multilingual large language model made by ISANG AI. You only respond in Persian, Korean, or English. If a user uses one of these languages, reply in the same language."
-
-     # Format the conversation history with system prompt
-     context = f"System: {system_prompt}\n"
-     for user_message, bot_message in history:
-         context += f"User: {user_message}\nBot: {bot_message}\n"
-     context += f"User: {prompt}\nBot:"
-
-     # Generate a response incrementally
-     inputs = tokenizer(context, return_tensors="pt", truncation=True, max_length=512)
-     output_ids = model.generate(
-         **inputs,
-         max_new_tokens=200,
-         temperature=0.7,
-         do_sample=True,
-         return_dict_in_generate=True,
-         output_scores=False
      )
-
-     response_ids = output_ids.sequences[0]
-     decoded_text = tokenizer.decode(response_ids, skip_special_tokens=True)
-
-     # Stream response word by word
-     response = decoded_text[len(context):].strip()
-     words = response.split()
-     history.append((prompt, ""))  # Add the prompt to history with an empty response initially
-     for i, word in enumerate(words):
-         # Append the next word to the history
-         history[-1] = (prompt, " ".join(words[: i + 1]))
-         yield history, " ".join(words[: i + 1])  # Stream the current response
-
- gradio_app = gr.Interface(
-     fn=stream_chat,
-     inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), "state"],
-     outputs=["state", "text"],
-     title="ISANG Chatbot",
-     description="This is a chatbot powered by the ISANG model. Enter your messages to chat with it!",
-     examples=[
-         ["سلام، چطوری؟"],
-         ["برام یه داستان تعریف کن"],
-         ["نظرت درباره هوش مصنوعی چیه؟"]
-     ],
-     live=True  # Enable live streaming for Gradio
- )
-
  if __name__ == "__main__":
-     gradio_app.launch()
 
  import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch
+
+ # Load the model and tokenizer
+ def load_model():
+     model_name = "hosseinhimself/ISANG-v1.0-8B"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
+     return tokenizer, model
+
+ tokenizer, model = load_model()
+
+ # Define the inference function
+ def chat(input_text, max_tokens, temperature, history):
+     try:
+         # Concatenate history to the input prompt
+         if history:
+             input_text = history + "\nUser: " + input_text + "\nAssistant:"
+         else:
+             input_text = "User: " + input_text + "\nAssistant:"
+
+         inputs = tokenizer.encode(input_text, return_tensors="pt")
+
+         # Generate the response
+         output = model.generate(
+             inputs,
+             max_length=inputs.shape[1] + int(max_tokens),  # sliders return floats; max_length must be an int
+             temperature=temperature,
+             do_sample=True
          )
+
+         output_text = tokenizer.decode(output[0], skip_special_tokens=True)
+
+         # Extract the assistant's reply and update history
+         # (rfind, not find: with history present the prompt already contains
+         # earlier "Assistant:" turns, and we want only the newest reply)
+         response_start = output_text.rfind("Assistant:") + len("Assistant:")
+         response = output_text[response_start:].strip()
+         new_history = input_text + response
+
+         return response, new_history
+
      except Exception as e:
+         return str(e), ""
+
+ # Gradio interface
+ def reset_history():
+     return ""
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# ISANG-v1.0-8B Chatbot")
+
+     with gr.Row():
+         with gr.Column():
+             user_input = gr.Textbox(label="Your Input", placeholder="Type your message here...")
+             max_tokens = gr.Slider(minimum=10, maximum=512, value=256, label="Max Tokens")
+             temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
+             history = gr.Textbox(label="Conversation History", value="", visible=False)
+             send_button = gr.Button("Send")
+             clear_button = gr.Button("Clear History")
+
+         with gr.Column():
+             chat_output = gr.Textbox(label="Assistant's Response", lines=10)
+
+     send_button.click(
+         chat,
+         inputs=[user_input, max_tokens, temperature, history],
+         outputs=[chat_output, history]
      )
+
+     clear_button.click(reset_history, outputs=[history])

  if __name__ == "__main__":
+     # launch(enable_queue=...) is deprecated/removed in newer Gradio; queue() is the supported equivalent
+     demo.queue().launch(share=True)
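For a quick sanity check, the refactored chat() can be exercised without the UI: conversation state is just the plain-text transcript that each call returns. A minimal sketch, assuming the new file is saved as app.py (importing it loads the 8B model on CPU, so it is slow; the prompts and token counts are illustrative):

# Hypothetical driver script, not part of the commit.
from app import chat

history = ""  # the hidden history Textbox starts empty

# First turn: chat() builds "User: ...\nAssistant:" and generates a reply.
reply, history = chat("Hello!", max_tokens=64, temperature=0.7, history=history)
print(reply)

# Second turn: the returned transcript is passed back in, so the model
# sees the whole User:/Assistant: history.
reply, history = chat("Tell me a story", max_tokens=128, temperature=0.7, history=history)
print(reply)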