import os
import pickle
import time
from json import dumps, loads
from typing import Any, List, Mapping, Optional

import numpy as np
import openai
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import HfFileSystem
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, Pipeline

from assets.prompts import custom_prompts

from llama_index.core import (
    StorageContext,
    SimpleDirectoryReader,
    VectorStoreIndex,
    load_index_from_storage,
    PromptHelper,
    PromptTemplate,
    Settings,
)
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    LLMMetadata,
)
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core.base.llms.types import ChatMessage

load_dotenv()

fs = HfFileSystem()

CONTEXT_WINDOW = 2048
NUM_OUTPUT = 525
CHUNK_OVERLAP_RATIO = 0.2


ANSWER_FORMAT = """
Use the following example format for your answer:
[FORMAT]
Answer:
The answer to the user question.
Reference:
The list of references to the specific sections of the documents that support your answer.
[END_FORMAT]
"""

QUERY_ENGINE_QA_TEMPLATE = """
We have provided context information below:
[CONTEXT]
{context_str}
[END_CONTEXT]
Given this information, please answer the following question:
[QUESTION]
{query_str}
[END_QUESTION]
"""

QUERY_ENGINE_REFINE_TEMPLATE = """
The original query is as follows:
[QUESTION]
{query_str}
[END_QUESTION]

We have provided an existing answer:
[ANSWER]
{existing_answer}
[END_ANSWER]

We have the opportunity to refine the existing answer (only if needed) with some more
context below.
[CONTEXT]
{context_msg}
[END_CONTEXT]

Given the new context, refine the original answer to include more details like references \
to the specific sections of the documents that support your answer.

Refined Answer:
"""
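
# The query-engine templates above are wrapped in llama_index's PromptTemplate,
# which fills in the placeholders (e.g. {context_str}, {query_str}) at query time.
# Illustrative example with hypothetical values:
#
#   PromptTemplate(QUERY_ENGINE_QA_TEMPLATE).format(
#       context_str="...retrieved document chunks...",
#       query_str="What does the installation section say?",
#   )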

CHAT_ENGINE_CONTEXT_PROMPT_TEMPLATE = """
The following is a friendly conversation between a user and an AI assistant.
The assistant is talkative and provides lots of specific details from its context.
If the assistant does not know the answer to a question, it truthfully says it
does not know.

Here are the relevant documents for the context:

{context_str}

Instruction: Based on the above documents, provide a detailed answer for the user question below. \
Include references to the specific sections of the documents that support your answer. \
Answer "don't know" if the answer is not present in the documents.
"""

CHAT_ENGINE_CONDENSE_PROMPT_TEMPLATE = """
Given the following conversation between a user and an AI assistant and a follow-up question from the user,
rephrase the follow-up question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:
"""


@st.cache_resource
def load_model(model_name: str):
    # Load the tokenizer and model once (cached by Streamlit) and wrap them in a
    # Hugging Face text-generation pipeline.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.7,
    )

    return pipe


class OurLLM(CustomLLM):
    context_window: int = 3900
    num_output: int = 256
    model_name: str = ""
    pipeline: Optional[Pipeline] = None

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=CONTEXT_WINDOW,
            num_output=NUM_OUTPUT,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        # The text-generation pipeline echoes the prompt before the completion,
        # so strip the prompt from the generated text.
        prompt_length = len(prompt)
        response = self.pipeline(prompt, max_new_tokens=NUM_OUTPUT)[0]["generated_text"]
        text = response[prompt_length:]
        return CompletionResponse(text=text)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any):
        # Generate the full completion, then replay it incrementally as a stream.
        full_text = self.complete(prompt, **kwargs).text
        response = ""
        for token in full_text:
            response += token
            yield CompletionResponse(text=response, delta=token)


class LlamaCustom:
    def __init__(self, model_name: str, index: VectorStoreIndex):
        self.model_name = model_name
        self.index = index
        self.chat_mode = "condense_plus_context"
        self.memory = ChatMemoryBuffer.from_defaults()
        self.verbose = True

    def get_response(self, query_str: str, chat_history: List[ChatMessage]):
        query_engine = self.index.as_query_engine(
            text_qa_template=PromptTemplate(QUERY_ENGINE_QA_TEMPLATE),
            refine_template=PromptTemplate(QUERY_ENGINE_REFINE_TEMPLATE),
            verbose=self.verbose,
        )
        response = query_engine.query(query_str)
        return str(response)

    def get_stream_response(self, query_str: str, chat_history: List[ChatMessage]):
        # Simulate streaming by yielding the finished answer word by word.
        response = self.get_response(query_str=query_str, chat_history=chat_history)
        for word in response.split():
            yield word + " "
            time.sleep(0.05)
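

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the app flow above). It assumes a
# hypothetical local "data/" directory of documents and a placeholder model
# name, and that an embedding model is available (llama_index defaults to
# OpenAI embeddings, which require an API key). Adjust to your own setup.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_model_name = "gpt2"  # placeholder; any causal LM on the Hub
    Settings.llm = OurLLM(
        model_name=example_model_name,
        pipeline=load_model(example_model_name),
    )

    # Build an in-memory vector index over the local documents and ask a question.
    documents = SimpleDirectoryReader("data").load_data()
    index = VectorStoreIndex.from_documents(documents)

    bot = LlamaCustom(model_name=example_model_name, index=index)
    print(bot.get_response("What are these documents about?", chat_history=[]))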