ShravanHN committed
Commit: 4efef34
Parent(s): 557609c

added chunks if tokens are more
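In short, the commit splits an oversized prompt into word-based chunks (up to 4,000 words each via the new chunk_text helper), generates a response per chunk, and joins the per-chunk responses. A minimal illustration of that flow (a sketch based on the helpers added in the diff below, not code from the commit itself; the 9,000-word input is a made-up example):

# Sketch of the chunking flow this commit introduces (illustration only):
# split a long message into <=4000-word pieces, generate per piece, join results.
def chunk_text(text, chunk_size=4000):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

long_message = "word " * 9000                 # hypothetical oversized input
pieces = chunk_text(long_message)
print(len(pieces))                            # -> 3 (4000 + 4000 + 1000 words)
per_chunk_responses = ['{"a": 1}', '{"b": 2}', '{"c": 3}']  # stand-ins for per-chunk model output
print(" ".join(per_chunk_responses))          # combine_responses() simply joins them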

Files changed (1)
  1. app.py +88 -25
app.py CHANGED
@@ -1,14 +1,18 @@
-import spaces
 import gradio as gr
 import os
+import time
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
 import torch
 from threading import Thread
+import logging
+import spaces
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
-
 DESCRIPTION = '''
 <div>
 <h1 style="text-align: center;">ContenteaseAI custom trained model</h1>
@@ -17,7 +21,6 @@ DESCRIPTION = '''
 
 LICENSE = """
 <p/>
-
 ---
 For more information, visit our [website](https://contentease.ai).
 """
@@ -29,14 +32,13 @@ PLACEHOLDER = """
 </div>
 """
 
-
 css = """
 h1 {
 text-align: center;
 display: block;
 }
-
 """
+
 # Load the tokenizer and model with quantization
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 bnb_config = BitsAndBytesConfig(
@@ -46,14 +48,21 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16
 )
 
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    quantization_config=bnb_config,
-    torch_dtype=torch.bfloat16
-)
-model.generation_config.pad_token_id = tokenizer.pad_token_id
+try:
+    logger.info("Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    logger.info("Loading model...")
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        device_map="auto",
+        quantization_config=bnb_config,
+        torch_dtype=torch.bfloat16
+    )
+    model.generation_config.pad_token_id = tokenizer.pad_token_id
+    logger.info("Model and tokenizer loaded successfully.")
+except Exception as e:
+    logger.error(f"Error loading model or tokenizer: {e}")
+    raise
 
 terminators = [
     tokenizer.eos_token_id,
@@ -67,25 +76,41 @@ Bad JSON example: {'lobby': { 'frcm': { 'replace': [ 'carpet', 'carpet_pad', 'ba
 Make sure to fetch details from the provided text and ignore unnecessary information. The response should be in JSON format only, without any additional comments.
 """
 
-@spaces.GPU(duration=120)
-def chat_llama3_8b(message: str, history: list, temperature: float, max_new_tokens: int):
+def chunk_text(text, chunk_size=4000):
     """
-    Generate a streaming response using the llama3-8b model.
+    Splits the input text into chunks of specified size.
 
     Args:
-        message (str): The input message.
-        history (list): The conversation history used by ChatInterface.
-        temperature (float): The temperature for generating the response.
-        max_new_tokens (int): The maximum number of new tokens to generate.
+        text (str): The input text to be chunked.
+        chunk_size (int): The size of each chunk in tokens.
 
     Returns:
-        str: The generated response.
+        list: A list of text chunks.
     """
-    conversation = [{"role": "system", "content": SYS_PROMPT}]
+    words = text.split()
+    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+    return chunks
+
+def combine_responses(responses):
+    """
+    Combines the responses from all chunks into a final output string.
+
+    Args:
+        responses (list): A list of responses from each chunk.
+
+    Returns:
+        str: The combined output string.
+    """
+    combined_output = " ".join(responses)
+    return combined_output
+
+def generate_response_for_chunk(chunk, history, temperature, max_new_tokens):
+    start_time = time.time()
 
+    conversation = [{"role": "system", "content": SYS_PROMPT}]
     for user, assistant in history:
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-    conversation.append({"role": "user", "content": message})
+    conversation.append({"role": "user", "content": chunk})
 
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
 
@@ -109,8 +134,43 @@ def chat_llama3_8b(message: str, history: list, temperature: float, max_new_tokens: int):
     outputs = []
     for text in streamer:
         outputs.append(text)
-        yield "".join(outputs)
+
+    end_time = time.time()
+    logger.info(f"Time taken for generating response for a chunk: {end_time - start_time} seconds")
+
+    return "".join(outputs)
+
+@spaces.GPU(duration=120)
+def chat_llama3_8b(message: str, history: list, temperature: float, max_new_tokens: int):
+    """
+    Generate a streaming response using the llama3-8b model with chunking.
+
+    Args:
+        message (str): The input message.
+        history (list): The conversation history used by ChatInterface.
+        temperature (float): The temperature for generating the response.
+        max_new_tokens (int): The maximum number of new tokens to generate.
+
+    Returns:
+        str: The generated response.
+    """
+    try:
+        start_time = time.time()
+
+        chunks = chunk_text(message)
+        responses = []
+        for chunk in chunks:
+            response = generate_response_for_chunk(chunk, history, temperature, max_new_tokens)
+            responses.append(response)
+        final_output = combine_responses(responses)
+
+        end_time = time.time()
+        logger.info(f"Total time taken for generating response: {end_time - start_time} seconds")
 
+        yield final_output
+    except Exception as e:
+        logger.error(f"Error generating response: {e}")
+        yield "An error occurred while generating the response. Please try again."
 
 # Gradio block
 chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
@@ -132,4 +192,7 @@ with gr.Blocks(fill_height=True, css=css) as demo:
     gr.Markdown(LICENSE)
 
 if __name__ == "__main__":
-    demo.launch(show_error=True, debug=True)
+    try:
+        demo.launch(show_error=True)
+    except Exception as e:
+        logger.error(f"Error launching Gradio demo: {e}")