import os

import spaces
from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory
from langchain.prompts import PromptTemplate
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# The model repo id and Hugging Face token are read from environment variables
# (for example, configured as Space secrets).
my_model_id = os.getenv('MODEL_REPO_ID', 'Default Value')
token = os.getenv('HUGGINGFACEHUB_API_TOKEN')

template = """You are an AI having conversation with a human. Below is an instruction that describes a task. |
|
Write a response that appropriately completes the request. |
|
Reply with the most helpful and logic answer. During the conversation you need to ask the user |
|
the following questions to complete the hotel booking task. |
|
1) Where would you like to stay and when? |
|
2) How many people are staying in the room? |
|
3) Do you prefer any ammenities like breakfast included or gym? |
|
4) What is your name, your email address and phone number? |
|
Make sure you receive a logical answer from the user from every question to complete the hotel |
|
booking process. |
|
|
|
Relevant Information: |
|
|
|
|
|
{history} |
|
|
|
Current Conversation: |
|
|
|
Human: {input} |
|
AI:""" |
|
|
|
|
|
@spaces.GPU
def load_model():
    # Load the tokenizer and the model, quantised to 8-bit to reduce GPU memory usage.
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(my_model_id)
    model = AutoModelForCausalLM.from_pretrained(
        my_model_id,
        device_map="auto",
        quantization_config=quantization_config,
    )
    return tokenizer, model


@spaces.GPU
def load_pipeline():
    tokenizer, model = load_model()

    # Wrap the model in a transformers text-generation pipeline with the decoding
    # parameters used for the conversation.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        top_k=30,
        top_p=0.7,
        early_stopping=True,
        num_beams=2,
        temperature=0.05,
        repetition_penalty=1.05,
    )

    # Expose the pipeline to LangChain as an LLM.
    llm = HuggingFacePipeline(pipeline=pipe)
    return llm


# Build the LLM once at module load so every request reuses the same pipeline.
llm = load_pipeline()


def demo_miny_memory():
    # Summarising buffer memory: older turns are condensed by the LLM, while recent
    # turns are kept verbatim under the "history" key.
    memory = ConversationSummaryBufferMemory(llm=llm, memory_key="history")
    return memory


@spaces.GPU
def demo_chain(input_text, history):
    PROMPT = PromptTemplate(template=template, input_variables=["history", "input"])
    conversation = ConversationChain(
        llm=llm,
        prompt=PROMPT,
        verbose=True,
        memory=demo_miny_memory(),
    )

    # The chain's memory supplies {history}, so only the new user message is passed
    # to the chain; the `history` argument from the UI is not fed in here.
    chat_reply = conversation.invoke(
        {"input": input_text},
        return_only_outputs=True,
    )
    # ConversationChain returns a dict keyed by "response"; return just the reply text.
    return chat_reply["response"]
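

# --- Usage sketch (assumption, not part of the original snippet) ---
# demo_chain(message, history) matches the (message, history) signature that
# gradio.ChatInterface expects, so one plausible way to expose the chain as a chat UI
# is shown below. The gradio import, the ChatInterface wiring, the title string, and
# the launch() call are illustrative assumptions rather than the original app code.
import gradio as gr

chat_ui = gr.ChatInterface(fn=demo_chain, title="Hotel Booking Assistant")

if __name__ == "__main__":
    chat_ui.launch()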