ShravanHN committed on
Commit 8410c86 · 1 Parent(s): 4efef34

modified the chunk limit and added error handling and caching of the model

Files changed (1)
app.py +27 -14
app.py CHANGED
@@ -6,6 +6,8 @@ import torch
 from threading import Thread
 import logging
 import spaces
+from functools import lru_cache
+
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -48,20 +50,31 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16
 )
 
+@lru_cache(maxsize=1)
+def load_model_and_tokenizer():
+    try:
+        start_time = time.time()
+        logger.info("Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        logger.info("Loading model...")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            device_map="auto",
+            quantization_config=bnb_config,
+            torch_dtype=torch.bfloat16
+        )
+        model.generation_config.pad_token_id = tokenizer.pad_token_id
+        end_time = time.time()
+        logger.info(f"Model and tokenizer loaded successfully in {end_time - start_time} seconds.")
+        return model, tokenizer
+    except Exception as e:
+        logger.error(f"Error loading model or tokenizer: {e}")
+        raise
+
 try:
-    logger.info("Loading tokenizer...")
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    logger.info("Loading model...")
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        device_map="auto",
-        quantization_config=bnb_config,
-        torch_dtype=torch.bfloat16
-    )
-    model.generation_config.pad_token_id = tokenizer.pad_token_id
-    logger.info("Model and tokenizer loaded successfully.")
+    model, tokenizer = load_model_and_tokenizer()
 except Exception as e:
-    logger.error(f"Error loading model or tokenizer: {e}")
+    logger.error(f"Failed to load model and tokenizer: {e}")
     raise
 
 terminators = [
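A note on this hunk: functools.lru_cache with maxsize=1 memoizes the zero-argument loader, so every call after the first returns the cached (model, tokenizer) pair instead of reloading the weights. One caveat: the new body calls time.time(), which assumes import time already appears earlier in app.py. A minimal runnable sketch of the caching behavior, where expensive_load is a hypothetical stand-in for the real loader:

import time
from functools import lru_cache

@lru_cache(maxsize=1)
def expensive_load():
    # Stand-in for AutoModelForCausalLM.from_pretrained(...): slow on first call.
    time.sleep(2)
    return object()  # placeholder for the (model, tokenizer) pair

start = time.time()
first = expensive_load()   # runs the body (~2s)
second = expensive_load()  # served from the cache (instant)
assert first is second     # the exact same object: loaded once
print(f"two calls took {time.time() - start:.1f}s")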
@@ -76,7 +89,7 @@ Bad JSON example: {'lobby': { 'frcm': { 'replace': [ 'carpet', 'carpet_pad', 'ba
 Make sure to fetch details from the provided text and ignore unnecessary information. The response should be in JSON format only, without any additional comments.
 """
 
-def chunk_text(text, chunk_size=4000):
+def chunk_text(text, chunk_size=5000):
     """
     Splits the input text into chunks of specified size.
 
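The diff shows only the changed signature and the start of the docstring; the body of chunk_text is not visible here. For orientation, an implementation consistent with that docstring would slice the input into fixed-size character windows. A sketch under that assumption, not the file's actual code:

def chunk_text(text, chunk_size=5000):
    """Splits the input text into chunks of specified size."""
    # Consecutive windows of at most chunk_size characters.
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Example: 12,000 characters become chunks of 5000, 5000, and 2000.
print([len(c) for c in chunk_text("x" * 12000)])  # [5000, 5000, 2000]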
@@ -185,7 +198,7 @@ with gr.Blocks(fill_height=True, css=css) as demo:
     additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
     additional_inputs=[
         gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature", render=False),
-        gr.Slider(minimum=128, maximum=9012, step=1, value=512, label="Max new tokens", render=False),
+        gr.Slider(minimum=128, maximum=2000, step=1, value=700, label="Max new tokens", render=False),
     ]
 )
 
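For context on this hunk: in Gradio, each control listed in additional_inputs is passed to the chat function as an extra positional argument after (message, history), so the retuned slider (max 2000, default 700) feeds max_new_tokens directly into generation. A hedged sketch of that wiring; the chat function below is illustrative, not the one in app.py:

import gradio as gr

def chat(message, history, temperature, max_new_tokens):
    # Slider values arrive as extra positional args after (message, history).
    return f"temperature={temperature}, max_new_tokens={max_new_tokens}"

demo = gr.ChatInterface(
    chat,
    additional_inputs=[
        gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature"),
        gr.Slider(minimum=128, maximum=2000, step=1, value=700, label="Max new tokens"),
    ],
)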
 