# Chatbot / setup.py
# edithram23's picture
# initial commit
# e900f80
# raw
# history blame
# 11.3 kB
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from retriever import Retriever
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
import os
import io
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from openai import OpenAI
from groq import Groq
import soundfile as sf
from deepgram import DeepgramClient, SpeakOptions
from langchain_groq import ChatGroq
import hashlib
import time
from uuid import uuid4
from dotenv import load_dotenv
# Load variables from the local `.env` file into the process environment at
# import time; the Qdrant, Groq and Deepgram clients in this module read their
# credentials through os.getenv / os.environ.
load_dotenv('.env')
class Script():
    """Build retrieval-augmented prompt chains for the question-answering bot.

    Two chains are offered: one over the general (Indian taxation) context
    answered by the OpenAI chat model, and one restricted to a single
    user-uploaded PDF answered by the Groq chat model.
    """

    # Sentence the prompts instruct the model to emit when it cannot answer.
    # Turns carrying exactly this text are dropped from the serialized history.
    # Must stay in sync with the sentence quoted inside the templates below.
    _NO_ANSWER = 'Sorry! Unable to find an answer for your question. Try Again.'

    # Prompt for the general knowledge base. `{question}`, `{context}` and
    # `{history}` are placeholders filled in by PromptTemplate at invoke time.
    _GENERAL_TEMPLATE = """
# You are an excellent Question & Answering BOT based on Context.
# TASK : Given a question and the context, you are required to answer the question..
# User questions may be given as a user_query (or) User_question (or) User_scenario.
===============================
#USER_QUERY : {question}
===============================
#METADATA_OF_CONTEXT :
-> The context given is related to INDIAN-TAXATION.
#CONTEXT : {context}
===============================
You are also given previous ChatHistories (User question and corresponding AI answer) to you as extra data.
--# When to take the history as CONTEXT: Only if the history is relevant to the current question, you are permitted to take the chat history as a context.
--# If it is not relevant to the current question, do not take it.
#Chat History : {history}
===============================
-> Don't provide your own answer that is not in the given context.
-> If you can provide a similar answer from the context that may be relevant but not exactly correct for the question, you can provide that answer.
-> Try to provide a proper output for the question. Don't explain any questions too lengthy max[100 words].
===============================
# OUTPUT FORMAT:
-> Your output may be given to a voice model for a speech output. Try to be precise with your words. At the same time, fill the user with your answer.
-> Don't provide any further explanation apart from the answer output.
# STEP 1 : Generate a output for the query from the context:
# STEP 2 : -> Based on the current output check if it is relevant to the question again.
-> If you are not 100% able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."
"""

    # Prompt for questions about one user-uploaded PDF (same placeholders).
    _PDF_TEMPLATE = """
# You are an excellent Question & Answering BOT. Given a question and the context you will answer the question only based on the given context.
# You will be given a user_query (or) User_question (or) User_scenario.
# TASK: Your task is to provide an Answer to the USER_QUERY with the given CONTEXT_DATA.
===============================
#USER_QUERY : {question}
===============================
#METADATA_OF_CONTEXT : -> The context given is a given from the user pdf input.
-> Based on the user_query use the context accordingly.
#CONTEXT : {context}
===============================
You are also given previous ChatHistories (User question and corressponding AI answer) to you as an extra data.
--# When to take the history as CONTEXT : Only if the history is relevant to the current question you are permitted to take the chat history as a context.
--# If it is not relevant to the current question do not take it.
#Chat History : {history}
===============================
-> You are allowed to provide the answer only from the given context.
-> Don't provide your own answer that is not in the given context.
-> If you are not able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."
-> Try to be a precise and provide a proper output for the question. Don't explain any questions too lengthy max[100 words].
-> Provide answer only to the question that is asked.
===============================
# OUTPUT FORMAT:
-> Your output may be given to a voice model for a speech output. Try to be precise with your words. At the same time, fill the user with your answer
-> Don't provide any etc explanation apart from the answer output.
"""

    def __init__(self):
        """Create the retriever and the chat-model clients used by the chains."""
        self.retriever = Retriever()
        self.openai_client = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
        self.groq = ChatGroq(model='llama3-70b-8192')
        # Not used inside this class; kept because code elsewhere may read it.
        self.groq1 = ChatGroq(model='llama3-8b-8192')

    def format_docs(self, format_results, id=False):
        """Render retrieved documents as one context string for the prompt.

        Args:
            format_results: Iterable of objects exposing a ``metadata`` mapping
                and a ``page_content`` string.
            id: When truthy, label each document with its ``DOCUMENT_NAME``
                metadata; otherwise with ``DOCUMENT_IS_ABOUT``.

        Returns:
            The per-document entries concatenated without a separator; an
            empty string when there are no documents.

        Raises:
            KeyError: If a document lacks the selected metadata key.
        """
        label_key = 'DOCUMENT_NAME' if id else 'DOCUMENT_IS_ABOUT'
        return "".join(
            f"**DOC {number}. METADATA : This DOC is about {doc.metadata[label_key]} \n CONTENT:{doc.page_content.strip()}**"
            for number, doc in enumerate(format_results, start=1)
        )

    def history(self, hist):
        """Serialize chat turns into a single string for the prompt.

        Turns whose content is the fixed "unable to answer" sentence are
        skipped so failed answers are not fed back to the model.

        Args:
            hist: Iterable of dicts with string ``'role'`` and ``'content'``
                entries; ``None`` is treated as an empty history.

        Returns:
            The concatenated ``|Role:...Content:...|`` segments, or ``''``
            when there is nothing to serialize.
        """
        # The original built this string but never returned it, so every
        # caller received None instead of the history text.
        return ''.join(
            '|Role:' + turn['role'] + 'Content:' + turn['content'] + '|'
            for turn in (hist or [])
            if turn['content'] != self._NO_ANSWER
        )

    @staticmethod
    def _make_chain(template, llm):
        """Compose prompt -> chat model -> plain-string output parser."""
        return PromptTemplate.from_template(template) | llm | StrOutputParser()

    def gpt_loaders(self, query: str, history: str):
        """Prepare the general-context chain and its input for ``query``.

        Args:
            query: The user's question.
            history: Previously serialized chat history.

        Returns:
            A ``(chain, inputs)`` tuple: the chain backed by the OpenAI chat
            model, and the dict of ``context`` / ``question`` / ``history``
            values to invoke it with.
        """
        rag_chain = self._make_chain(self._GENERAL_TEMPLATE, self.openai_client)
        retrieved = self.retriever.multiple_contexts(query)
        question = {
            "context": self.format_docs(retrieved),
            "question": query,
            "history": history,
        }
        return rag_chain, question

    def gpt_loaders_id(self, query: str, history: str, id: str):
        """Prepare the single-PDF chain and its input for ``query``.

        Args:
            query: The user's question.
            history: Previously serialized chat history.
            id: Identifier passed to the retriever's ``id_filter`` to select
                the uploaded document.

        Returns:
            A ``(chain, inputs)`` tuple: the chain backed by the Groq chat
            model, and the dict of ``context`` / ``question`` / ``history``
            values to invoke it with.
        """
        rag_chain = self._make_chain(self._PDF_TEMPLATE, self.groq)
        retrieved = self.retriever.id_filter(query, id)
        question = {
            "context": self.format_docs(retrieved, id=True),
            "question": query,
            "history": history,
        }
        return rag_chain, question
class Vector_db():
    """Load PDFs, split them into chunks and store them in Qdrant."""

    # Qdrant collection that holds the chunks of user-uploaded PDFs.
    _USER_COLLECTION = "siel-ai-user"

    def __init__(self):
        """Create the text splitter, the Qdrant client and the OpenAI client.

        The Qdrant endpoint and key are read from the ``QDRANT_URL`` and
        ``QDRANT_API_KEY`` environment variables.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1024,
            chunk_overlap=256,
            length_function=len,
            is_separator_regex=False,
        )
        self.qdrant_client = QdrantClient(
            url=os.getenv("QDRANT_URL"),
            api_key=os.getenv("QDRANT_API_KEY"),
        )
        self.openai_client = OpenAI()

    def get_embed(self, texts):
        """Return the embedding of the first item in the embeddings response.

        Only ``data[0]`` is read, so when ``texts`` holds several inputs the
        remaining embeddings are discarded.
        """
        # NOTE(review): this uses "text-embedding-3-large" while the user
        # collection is created with size=1536; confirm the dimensions agree
        # before storing vectors from this method in that collection.
        response = self.openai_client.embeddings.create(
            input=texts, model="text-embedding-3-large"
        )
        return response.data[0].embedding

    def text_split(self, full_text, meta):
        """Split ``full_text`` into chunk documents, each carrying ``meta``."""
        return self.text_splitter.create_documents([full_text], metadatas=[meta])

    def load_data(self, pdf_path: str):
        """Return the text of every page of the PDF at ``pdf_path``, concatenated."""
        pages = PyPDFLoader(pdf_path).load()
        return ''.join(page.page_content for page in pages)

    def getdocs(self, about, filename):
        """Load and chunk one PDF and tag the chunks with a fresh identifier.

        Args:
            about: Value stored under the ``DOCUMENT_NAME`` metadata key.
            filename: Path of the PDF to load.

        Returns:
            A ``(documents, identifier)`` tuple; ``identifier`` is also stored
            on every chunk under the ``ID`` metadata key.
        """
        text = self.load_data(filename)
        # The current time is mixed into the hash so uploading the same PDF
        # twice yields two different identifiers.
        digest_input = (text + str(time.time())).encode('utf-8')
        identifier = hashlib.sha256(digest_input).hexdigest()
        metadata = {'DOCUMENT_NAME': about, 'ID': str(identifier)}
        return self.text_split(text, metadata), identifier

    @staticmethod
    def _unique_ids(count):
        """Return ``count`` distinct UUID4 strings, one per point to store."""
        return [str(uuid4()) for _ in range(count)]

    def upload_pdfs_user(self, path, delete=False):
        """Chunk the PDF at ``path`` and add it to the user collection.

        Args:
            path: Path of the PDF; its base name becomes ``DOCUMENT_NAME``.
            delete: When truthy, drop the existing collection first so it
                only contains this upload.

        Returns:
            The identifier stored on the uploaded chunks.
        """
        client = self.qdrant_client
        collection = self._USER_COLLECTION

        if delete and client.collection_exists(collection):
            client.delete_collection(collection)
        if not client.collection_exists(collection):
            client.create_collection(
                collection_name=collection,
                vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
            )

        vector_store = QdrantVectorStore(
            client=client,
            collection_name=collection,
            embedding=OpenAIEmbeddings(),
        )
        documents, identifier = self.getdocs(os.path.basename(path), path)

        if documents:
            # Every chunk needs its own id. The original repeated a single
            # UUID for all chunks, so they were written to the same point and
            # overwrote one another.
            vector_store.add_documents(
                documents=documents,
                ids=self._unique_ids(len(documents)),
            )
        return identifier
class Speech_Text():
    """Speech-to-text through Groq and text-to-speech through Deepgram."""

    def __init__(self):
        """Build both API clients from environment-provided keys."""
        groq_key = os.getenv("GROQ_API_KEY")
        voice_key = os.environ.get("VOICE_API_KEY")
        self.client = Groq(api_key=groq_key)
        self.deepgram = DeepgramClient(voice_key)
        self.options = SpeakOptions(model="aura-luna-en")

    def get_transcript(self, audio):
        """Transcribe recorded audio and return the recognised text.

        ``audio[0]`` is passed to soundfile as the sample rate and ``audio[1]``
        as the sample data; the clip is encoded to MP3 in memory and sent to
        the transcription endpoint.
        """
        mp3_stream = io.BytesIO()
        sf.write(mp3_stream, audio[1], samplerate=audio[0], format="MP3")
        upload = ("audio.mp3", mp3_stream.getvalue())
        result = self.client.audio.transcriptions.create(
            file=upload,
            model="distil-whisper-large-v3-en",
            response_format="json",
            temperature=0.0,
        )
        return result.text

    def speech_synthesis(self, text: str):
        """Synthesise ``text`` to speech and return the MP3 bytes.

        The audio is saved to ``audio.mp3`` in the working directory and then
        read back. Any failure is printed and ``None`` is returned instead.
        """
        payload = {"text": text}
        out_path = "audio.mp3"
        try:
            self.deepgram.speak.v("1").save(out_path, payload, self.options)
            with open(out_path, "rb") as mp3_file:
                audio_bytes = mp3_file.read()
        except Exception as e:
            print(f"Exception: {e}")
            return None
        return audio_bytes