Phi3-ORPO

Paused

App Files Files Community

eswardivi committed on Apr 18

Commit

7dc3087

•

1 Parent(s): 9b7a2bc

Added Better Inferencing Techniques

Browse files

Files changed (1) hide show

app.py +24 -12

app.py CHANGED Viewed

@@ -1,29 +1,36 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import os
 from threading import Thread
 import spaces
 token = os.environ["HF_TOKEN"]
 model = AutoModelForCausalLM.from_pretrained("google/gemma-1.1-7b-it",
-                                             # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                                             torch_dtype=torch.float16,
                                              token=token)
-tok = AutoTokenizer.from_pretrained("google/gemma-1.1-7b-it",token=token)
-# using CUDA for an optimal experience
-# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 if torch.cuda.is_available():
     device = torch.device('cuda')
     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
 else:
     device = torch.device('cpu')
     print("Using CPU")
 model = model.to(device)
 @spaces.GPU
 def chat(message, history):
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
@@ -31,7 +38,6 @@ def chat(message, history):
             chat.append({"role": "assistant", "content": item[1]})
     chat.append({"role": "user", "content": message})
     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-    # Tokenize the messages string
     model_inputs = tok([messages], return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(
         tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
@@ -48,15 +54,21 @@ def chat(message, history):
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
-    # Initialize an empty string to store the generated text
     partial_text = ""
     for new_text in streamer:
-        # print(new_text)
         partial_text += new_text
-        # Yield an empty string to cleanup the message textbox and the updated conversation history
         yield partial_text
-demo = gr.ChatInterface(fn=chat, examples=[["Write me a poem about Machine Learning."]], title="gemma-1.1-7b-it")
 demo.launch()

 import gradio as gr
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
 import os
 from threading import Thread
 import spaces
+import time
 token = os.environ["HF_TOKEN"]
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16
+)
 model = AutoModelForCausalLM.from_pretrained("google/gemma-1.1-7b-it",
+                                             quantization_config=quantization_config,
                                              token=token)
+tok = AutoTokenizer.from_pretrained("google/gemma-1.1-7b-it", token=token)
 if torch.cuda.is_available():
     device = torch.device('cuda')
     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
 else:
     device = torch.device('cpu')
     print("Using CPU")
 model = model.to(device)
+model = model.to_bettertransformer()
 @spaces.GPU
 def chat(message, history):
+    start_time = time.time()
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
             chat.append({"role": "assistant", "content": item[1]})
     chat.append({"role": "user", "content": message})
     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
     model_inputs = tok([messages], return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(
         tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
     partial_text = ""
+    first_token_time = None
     for new_text in streamer:
+        if not first_token_time:
+            first_token_time = time.time() - start_time
         partial_text += new_text
         yield partial_text
+    total_time = time.time() - start_time
+    tokens = len(tok.tokenize(partial_text))
+    tokens_per_second = tokens / total_time if total_time > 0 else 0
+    # Append the timing information to the final output
+    timing_info = f"\nTime taken to first token: {first_token_time:.2f} seconds\nTokens per second: {tokens_per_second:.2f}"
+    yield partial_text + timing_info
+demo = gr.ChatInterface(fn=chat, examples=[["Write me a poem about Machine Learning."]], title="Chat With LLMS")
 demo.launch()