ShravanHN committed on
Commit 8410c86 · 1 Parent(s): 4efef34

modified the chunk limit and added error handling and caching of the model

Files changed (1)
app.py +27 -14
app.py CHANGED
@@ -6,6 +6,8 @@ import torch
 from threading import Thread
 import logging
 import spaces
+from functools import lru_cache
+
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -48,20 +50,31 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16
 )
 
+@lru_cache(maxsize=1)
+def load_model_and_tokenizer():
+    try:
+        start_time = time.time()
+        logger.info("Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        logger.info("Loading model...")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            device_map="auto",
+            quantization_config=bnb_config,
+            torch_dtype=torch.bfloat16
+        )
+        model.generation_config.pad_token_id = tokenizer.pad_token_id
+        end_time = time.time()
+        logger.info(f"Model and tokenizer loaded successfully in {end_time - start_time} seconds.")
+        return model, tokenizer
+    except Exception as e:
+        logger.error(f"Error loading model or tokenizer: {e}")
+        raise
+
 try:
-    logger.info("Loading tokenizer...")
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    logger.info("Loading model...")
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        device_map="auto",
-        quantization_config=bnb_config,
-        torch_dtype=torch.bfloat16
-    )
-    model.generation_config.pad_token_id = tokenizer.pad_token_id
-    logger.info("Model and tokenizer loaded successfully.")
+    model, tokenizer = load_model_and_tokenizer()
 except Exception as e:
-    logger.error(f"Error loading model or tokenizer: {e}")
+    logger.error(f"Failed to load model and tokenizer: {e}")
     raise
 
 terminators = [
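A note on this hunk: functools.lru_cache with maxsize=1 memoizes the zero-argument loader, so every call after the first returns the cached (model, tokenizer) pair instead of reloading the weights. One caveat: the new body calls time.time(), which assumes import time already appears earlier in app.py. A minimal runnable sketch of the caching behavior, where expensive_load is a hypothetical stand-in for the real loader:

import time
from functools import lru_cache

@lru_cache(maxsize=1)
def expensive_load():
    # Stand-in for AutoModelForCausalLM.from_pretrained(...): slow on first call.
    time.sleep(2)
    return object()  # placeholder for the (model, tokenizer) pair

start = time.time()
first = expensive_load()   # runs the body (~2s)
second = expensive_load()  # served from the cache (instant)
assert first is second     # the exact same object: loaded once
print(f"two calls took {time.time() - start:.1f}s")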
@@ -76,7 +89,7 @@ Bad JSON example: {'lobby': { 'frcm': { 'replace': [ 'carpet', 'carpet_pad', 'ba
 Make sure to fetch details from the provided text and ignore unnecessary information. The response should be in JSON format only, without any additional comments.
 """
 
-def chunk_text(text, chunk_size=4000):
+def chunk_text(text, chunk_size=5000):
     """
     Splits the input text into chunks of specified size.
 
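The diff shows only the changed signature and the start of the docstring; the body of chunk_text is not visible here. For orientation, an implementation consistent with that docstring would slice the input into fixed-size character windows. A sketch under that assumption, not the file's actual code:

def chunk_text(text, chunk_size=5000):
    """Splits the input text into chunks of specified size."""
    # Consecutive windows of at most chunk_size characters.
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Example: 12,000 characters become chunks of 5000, 5000, and 2000.
print([len(c) for c in chunk_text("x" * 12000)])  # [5000, 5000, 2000]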
@@ -185,7 +198,7 @@ with gr.Blocks(fill_height=True, css=css) as demo:
     additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
     additional_inputs=[
         gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature", render=False),
-        gr.Slider(minimum=128, maximum=9012, step=1, value=512, label="Max new tokens", render=False),
+        gr.Slider(minimum=128, maximum=2000, step=1, value=700, label="Max new tokens", render=False),
     ]
 )
 
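For context on this hunk: in Gradio, each control listed in additional_inputs is passed to the chat function as an extra positional argument after (message, history), so the retuned slider (max 2000, default 700) feeds max_new_tokens directly into generation. A hedged sketch of that wiring; the chat function below is illustrative, not the one in app.py:

import gradio as gr

def chat(message, history, temperature, max_new_tokens):
    # Slider values arrive as extra positional args after (message, history).
    return f"temperature={temperature}, max_new_tokens={max_new_tokens}"

demo = gr.ChatInterface(
    chat,
    additional_inputs=[
        gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature"),
        gr.Slider(minimum=128, maximum=2000, step=1, value=700, label="Max new tokens"),
    ],
)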
 