import os
import json
import subprocess
from threading import Thread

# Runtime installs (Hugging Face Spaces style). The existing environment is preserved
# so pip stays on PATH; FLASH_ATTENTION_SKIP_CUDA_BUILD skips the long CUDA build step.
subprocess.run("pip install psutil", shell=True)
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

import psutil
import gradio as gr
import spaces
import torch
import bitsandbytes as bnb  # quantization backend used by BitsAndBytesConfig
from huggingface_hub import InferenceClient
from accelerate import Accelerator
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

"""
For more information on `huggingface_hub` Inference API support, please check the docs:
https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# pip install 'git+https://github.com/huggingface/transformers.git'

# Hugging Face access token for gated checkpoints (set as a Space secret named "token").
token = os.getenv("token")
print("token set:", token is not None)  # avoid printing the secret itself

# Previously tried checkpoints: mistralai/Mistral-7B-v0.3,
# microsoft/Phi-3-medium-4k-instruct, Qwen/Qwen2-7B-Instruct,
# deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
MODEL_ID = "microsoft/phi-4"
CHAT_TEMPLATE = "Auto"
MODEL_NAME = MODEL_ID.split("/")[-1]
CONTEXT_LENGTH = 16000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Optional 4-bit quantization; pass quantization_config to from_pretrained() to enable it.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    # quantization_config=quantization_config,
    attn_implementation="flash_attention_2",
)

accelerator = Accelerator()
model = accelerator.prepare(model)


def str_to_json(str_obj):
    """Parse a JSON-encoded string into a Python object."""
    return json.loads(str_obj)
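# Illustrative only: the payload respond() below expects is a JSON-encoded list of
# chat turns, which str_to_json() parses back into Python dicts. The content here is
# a made-up example.
# example_payload = json.dumps([
#     {"role": "system", "content": "You are a friendly Chatbot."},
#     {"role": "user", "content": "Hello!"},
# ])
# assert str_to_json(example_payload)[1]["content"] == "Hello!"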
@spaces.GPU(duration=60)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # The ChatInterface textbox is expected to carry a JSON-encoded list of
    # {"role": ..., "content": ...} turns rather than plain text.
    messages = str_to_json(message)
    print(messages)

    stop_tokens = ["<|endoftext|>", "<|im_end|>"]

    # Build a phi-4 style prompt from the supplied turns.
    instruction = ""  # '<|im_start|>system\n' + system_message + '\n<|im_end|>\n'
    for turn in messages:
        role = turn["role"]
        content = turn["content"]
        instruction += f"<|im_start|>{role}<|im_sep|>\n{content}\n<|im_end|>\n"
    instruction += "<|im_start|>assistant<|im_sep|>\n"
    print(instruction)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
    input_ids, attention_mask = enc.input_ids, enc.attention_mask

    # Keep only the most recent CONTEXT_LENGTH tokens of the prompt.
    if input_ids.shape[1] > CONTEXT_LENGTH:
        input_ids = input_ids[:, -CONTEXT_LENGTH:]
        attention_mask = attention_mask[:, -CONTEXT_LENGTH:]

    generate_kwargs = dict(
        input_ids=input_ids.to(device),
        attention_mask=attention_mask.to(device),
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_tokens,
        top_k=40,
        repetition_penalty=1.1,
        top_p=0.95,  # note: hard-coded; the UI top_p slider is not applied here
    )

    # Generate in a background thread and stream partial text back to the UI.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = ""
    for new_token in streamer:
        outputs += new_token
        yield outputs
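# For reference, the prompt assembled by respond() for a single user turn ("Hello!")
# looks like this (illustrative; the markers come from the loop above):
#
#   <|im_start|>user<|im_sep|>
#   Hello!
#   <|im_end|>
#   <|im_start|>assistant<|im_sep|>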
"""
For information on how to customize the ChatInterface, peruse the gradio docs:
https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
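# Local smoke test (illustrative sketch; RUN_SMOKE_TEST is a hypothetical flag, and a
# GPU environment where the model above loaded is assumed). respond() is a generator,
# so iterate it and keep the last partial string.
# if os.getenv("RUN_SMOKE_TEST"):
#     demo_message = json.dumps([{"role": "user", "content": "Say hello in one sentence."}])
#     final = ""
#     for partial in respond(demo_message, [], "You are a friendly Chatbot.", 64, 0.7, 0.95):
#         final = partial
#     print(final)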