import os
import pickle
import time
from json import dumps, loads
from typing import Any, List, Mapping, Optional

import numpy as np
import openai
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import HfFileSystem
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, Pipeline

from assets.prompts import custom_prompts

from llama_index.core import (
    StorageContext,
    SimpleDirectoryReader,
    VectorStoreIndex,
    load_index_from_storage,
    PromptHelper,
    PromptTemplate,
    Settings,
)
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    LLMMetadata,
)
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core.base.llms.types import ChatMessage

load_dotenv()

fs = HfFileSystem()

CONTEXT_WINDOW = 2048
NUM_OUTPUT = 525
CHUNK_OVERLAP_RATIO = 0.2


ANSWER_FORMAT = """
Use the following example format for your answer:
[FORMAT]
Answer:
The answer to the user question.
Reference:
The list of references to the specific sections of the documents that support your answer.
[END_FORMAT]
"""

QUERY_ENGINE_QA_TEMPLATE = """
We have provided context information below:
[CONTEXT]
{context_str}
[END_CONTEXT]
Given this information, please answer the following question:
[QUESTION]
{query_str}
[END_QUESTION]
"""

QUERY_ENGINE_REFINE_TEMPLATE = """
The original query is as follows:
[QUESTION]
{query_str}
[END_QUESTION]

We have provided an existing answer:
[ANSWER]
{existing_answer}
[END_ANSWER]

We have the opportunity to refine the existing answer (only if needed) with some more
context below.
[CONTEXT]
{context_msg}
[END_CONTEXT]

Given the new context, refine the original answer to include more details like references \
to the specific sections of the documents that support your answer.

Refined Answer:
"""
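
# The query-engine templates above are wrapped in llama_index's PromptTemplate,
# which fills in the placeholders (e.g. {context_str}, {query_str}) at query time.
# Illustrative example with hypothetical values:
#
#   PromptTemplate(QUERY_ENGINE_QA_TEMPLATE).format(
#       context_str="...retrieved document chunks...",
#       query_str="What does the installation section say?",
#   )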

CHAT_ENGINE_CONTEXT_PROMPT_TEMPLATE = """
The following is a friendly conversation between a user and an AI assistant.
The assistant is talkative and provides lots of specific details from its context.
If the assistant does not know the answer to a question, it truthfully says it
does not know.

Here are the relevant documents for the context:

{context_str}

Instruction: Based on the above documents, provide a detailed answer for the user question below. \
Include references to the specific sections of the documents that support your answer. \
Answer "don't know" if the answer is not present in the documents.
"""

CHAT_ENGINE_CONDENSE_PROMPT_TEMPLATE = """
Given the following conversation between a user and an AI assistant and a follow-up question from the user,
rephrase the follow-up question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:
"""


@st.cache_resource
def load_model(model_name: str):
    # Load the tokenizer and model once (cached by Streamlit) and wrap them in a
    # Hugging Face text-generation pipeline.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.7,
    )

    return pipe


class OurLLM(CustomLLM):
    context_window: int = 3900
    num_output: int = 256
    model_name: str = ""
    pipeline: Optional[Pipeline] = None

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=CONTEXT_WINDOW,
            num_output=NUM_OUTPUT,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        # The text-generation pipeline echoes the prompt before the completion,
        # so strip the prompt from the generated text.
        prompt_length = len(prompt)
        response = self.pipeline(prompt, max_new_tokens=NUM_OUTPUT)[0]["generated_text"]
        text = response[prompt_length:]
        return CompletionResponse(text=text)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any):
        # Generate the full completion, then replay it incrementally as a stream.
        full_text = self.complete(prompt, **kwargs).text
        response = ""
        for token in full_text:
            response += token
            yield CompletionResponse(text=response, delta=token)


class LlamaCustom:
    def __init__(self, model_name: str, index: VectorStoreIndex):
        self.model_name = model_name
        self.index = index
        self.chat_mode = "condense_plus_context"
        self.memory = ChatMemoryBuffer.from_defaults()
        self.verbose = True

    def get_response(self, query_str: str, chat_history: List[ChatMessage]):
        query_engine = self.index.as_query_engine(
            text_qa_template=PromptTemplate(QUERY_ENGINE_QA_TEMPLATE),
            refine_template=PromptTemplate(QUERY_ENGINE_REFINE_TEMPLATE),
            verbose=self.verbose,
        )
        response = query_engine.query(query_str)
        return str(response)

    def get_stream_response(self, query_str: str, chat_history: List[ChatMessage]):
        # Simulate streaming by yielding the finished answer word by word.
        response = self.get_response(query_str=query_str, chat_history=chat_history)
        for word in response.split():
            yield word + " "
            time.sleep(0.05)
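

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the app flow above). It assumes a
# hypothetical local "data/" directory of documents and a placeholder model
# name, and that an embedding model is available (llama_index defaults to
# OpenAI embeddings, which require an API key). Adjust to your own setup.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_model_name = "gpt2"  # placeholder; any causal LM on the Hub
    Settings.llm = OurLLM(
        model_name=example_model_name,
        pipeline=load_model(example_model_name),
    )

    # Build an in-memory vector index over the local documents and ask a question.
    documents = SimpleDirectoryReader("data").load_data()
    index = VectorStoreIndex.from_documents(documents)

    bot = LlamaCustom(model_name=example_model_name, index=index)
    print(bot.get_response("What are these documents about?", chat_history=[]))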