import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Note: newer LangChain releases move this import to langchain_community.llms.
from langchain.llms import HuggingFacePipeline
import gradio as gr

# Report the device; the 4-bit bitsandbytes quantization below requires a CUDA GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:", device)
if device == 'cuda':
    print(torch.cuda.get_device_name(0))

# Load weights from a pre-sharded mirror of the checkpoint; the tokenizer
# still comes from the original Mistral repository.
origin_model_path = "mistralai/Mistral-7B-Instruct-v0.1"
model_path = "filipealmeida/Mistral-7B-Instruct-v0.1-sharded"

# Quantize to 4-bit NF4 with double quantization; do compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(origin_model_path)

text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,  # Mistral defines no pad token; reuse EOS.
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=100,
    temperature=0.5,
    do_sample=True,
)

# Wrap the transformers pipeline so it can be used as a LangChain LLM.
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

def get_response(message, history):
    return mistral_llm.invoke(message)

demo = gr.ChatInterface(get_response)
demo.launch()
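
# A possible refinement (a sketch, not part of the original app): Mistral-7B-Instruct
# is tuned on the [INST] ... [/INST] chat format, but get_response above sends the
# raw message to the pipeline and ignores the history. Assuming a transformers
# version with chat-template support, the prompt could instead be built with
# tokenizer.apply_chat_template; the name get_response_templated is hypothetical.
def get_response_templated(message, history):
    # Replay earlier turns so the model sees the whole conversation.
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # With return_full_text=True the reply echoes the prompt; set it to False
    # in the pipeline above if only the new completion should be shown.
    return mistral_llm.invoke(prompt)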