gemma-2-9b-it1

Runtime error

App Files Files Community

Leri777 committed on Oct 9, 2024

Commit

2f7b0a4

•

1 Parent(s): 4aae838

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -54

app.py CHANGED Viewed

@@ -1,81 +1,123 @@
 import os
 from threading import Thread
 from typing import Iterator
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer
-DESCRIPTION = """\
-# Gemma 2 9B IT
 Gemma 2 is Google's latest iteration of open LLMs.
 This is a demo of [`google/gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it), fine-tuned for instruction following.
-For more details, please check [our post](https://huggingface.co/blog/gemma2).
-👉 Looking for a larger and more powerful version? Try the 27B version in [HuggingChat](https://huggingface.co/chat/models/google/gemma-2-27b-it).
 """
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 model_id = "google/gemma-2-9b-it"
 tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-model.config.sliding_window = 4096
-model.eval()
-@spaces.GPU(duration=90)
-def generate(
-    message: str,
-    chat_history: list[dict],
-    max_new_tokens: int = 1024,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
-) -> Iterator[str]:
-    conversation = chat_history.copy()
-    conversation.append({"role": "user", "content": message})
-    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
-        {"input_ids": input_ids},
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        top_p=top_p,
-        top_k=top_k,
-        temperature=temperature,
-        num_beams=1,
-        repetition_penalty=repetition_penalty,
     )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        yield "".join(outputs)
 chat_interface = gr.ChatInterface(
-    fn=generate,
     additional_inputs=[
         gr.Slider(
             label="Max new tokens",
@@ -132,3 +174,5 @@ with gr.Blocks(css="style.css", fill_height=True) as demo:
 if __name__ == "__main__":
     demo.queue(max_size=20).launch()

 import os
 from threading import Thread
 from typing import Iterator
+import logging
+from logging.handlers import RotatingFileHandler
 import gradio as gr
 import spaces
 import torch
+from transformers import AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer, pipeline
+from langchain_huggingface import HuggingFacePipeline
+from langchain.prompts import PromptTemplate
+from langchain.chains import LLMChain
+# Logging setup
+log_file = '/tmp/app_debug.log'
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+file_handler = RotatingFileHandler(log_file, maxBytes=10*1024*1024, backupCount=5)
+file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+logger.addHandler(file_handler)
+logger.debug("Application started")
+DESCRIPTION = """
+# Gemma 2 9B IT with LangChain Integration
 Gemma 2 is Google's latest iteration of open LLMs.
 This is a demo of [`google/gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it), fine-tuned for instruction following.
+Now integrated with LangChain for enhanced interaction capabilities.
 """
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 model_id = "google/gemma-2-9b-it"
 tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
+# Load model with GPU availability check
+if torch.cuda.is_available():
+    logger.debug("GPU is available. Proceeding with GPU setup.")
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        device_map="auto",
+        torch_dtype=torch.bfloat16,
+    )
+else:
+    logger.warning("GPU is not available. Proceeding with CPU setup.")
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        device_map="auto",
+        low_cpu_mem_usage=True,
     )
+model.config.sliding_window = 4096
+model.eval()
+# Create Hugging Face pipeline
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_length=MAX_MAX_NEW_TOKENS,
+    temperature=0.7,
+    top_k=50,
+    top_p=0.9,
+    repetition_penalty=1.2,
+)
+# Initialize HuggingFacePipeline model for LangChain
+chat_model = HuggingFacePipeline(pipeline=pipe)
+logger.debug("Model and tokenizer loaded successfully")
+# Define the conversation template for LangChain
+template = """<|im_start|>system
+{system_prompt}
+<|im_end|>
+{history}
+<|im_start|>user
+{human_input}
+<|im_end|>
+<|im_start|>assistant"""
+# Create LangChain prompt and chain
+prompt = PromptTemplate(
+    template=template, input_variables=["system_prompt", "history", "human_input"]
+)
+chain = LLMChain(llm=chat_model, prompt=prompt)
+# Prediction function using LangChain and model
+def predict(
+    message,
+    chat_history,
+    max_new_tokens,
+    temperature,
+    top_p,
+    top_k,
+    repetition_penalty,
+):
+    formatted_history = "\n".join(
+        [f"<|im_start|>{entry['role']}\n{entry['content']}<|im_end|>" for entry in chat_history]
+    )
+    system_prompt = "You are a helpful coding assistant."
+    try:
+        result = chain.run(
+            {
+                "system_prompt": system_prompt,
+                "history": formatted_history,
+                "human_input": message,
+            }
+        )
+        return result
+    except Exception as e:
+        logger.exception(f"Error during prediction: {e}")
+        return "An error occurred."
+# Gradio UI
 chat_interface = gr.ChatInterface(
+    fn=predict,
     additional_inputs=[
         gr.Slider(
             label="Max new tokens",
 if __name__ == "__main__":
     demo.queue(max_size=20).launch()
+logger.debug("Chat interface initialized and launched")