Spaces:

edithram23
/

Chatbot

Runtime error

App Files Files Community

Chatbot / setup.py

edithram23

initial commit

e900f80 5 months ago

raw

history blame

11.3 kB

	from langchain_core.prompts import PromptTemplate
	from langchain_openai import ChatOpenAI
	from langchain_core.output_parsers import StrOutputParser
	from retriever import Retriever
	from qdrant_client import QdrantClient
	from qdrant_client.http.models import Distance, VectorParams
	import os
	import io
	from langchain_qdrant import QdrantVectorStore
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_openai import OpenAIEmbeddings
	from langchain_community.document_loaders import PyPDFLoader
	from openai import OpenAI
	from groq import Groq
	import soundfile as sf
	from deepgram import DeepgramClient, SpeakOptions
	from langchain_groq import ChatGroq
	import hashlib
	import time
	from uuid import uuid4
	from dotenv import load_dotenv

	# Populate the process environment from the local '.env' file so that the
	# os.getenv / os.environ lookups further down (QDRANT_URL, QDRANT_API_KEY,
	# GROQ_API_KEY, VOICE_API_KEY) can find their values.
	load_dotenv('.env')

	class Script():
	    """Builds retrieval-augmented question-answering chains.

	    Each ``gpt_loaders*`` method returns a ``(chain, inputs)`` pair rather
	    than an answer; the caller is expected to run the chain with the inputs
	    (presumably ``chain.invoke(inputs)`` or a streaming equivalent - confirm
	    against the caller).
	    """

	    # Fallback sentence the prompts tell the model to emit when the context
	    # does not answer the question. It must stay identical to the sentence
	    # embedded in both prompt templates below, because history() uses it to
	    # drop such turns from the chat history.
	    _NO_ANSWER = 'Sorry! Unable to find an answer for your question. Try Again.'

	    def __init__(self):
	        """Create the retriever and the chat-model clients."""
	        self.retriever = Retriever()
	        self.openai_client = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
	        self.groq = ChatGroq(model='llama3-70b-8192')
	        # Not referenced by any method of this class.
	        self.groq1 = ChatGroq(model='llama3-8b-8192')

	    def format_docs(self, format_results, id=False):
	        """Render retrieved documents as one numbered context string.

	        Args:
	            format_results: Iterable of documents exposing ``metadata`` (a
	                mapping) and ``page_content`` (a string).
	            id: When true, each document is described by its
	                ``DOCUMENT_NAME`` metadata; otherwise by
	                ``DOCUMENT_IS_ABOUT``. The name shadows the builtin but is
	                kept so keyword callers keep working.

	        Returns:
	            The per-document entries concatenated without a separator; an
	            empty string when there are no documents.

	        Raises:
	            KeyError: If a document lacks the selected metadata key.
	        """
	        metadata_key = 'DOCUMENT_NAME' if id else 'DOCUMENT_IS_ABOUT'
	        return "".join(
	            f"DOC {i}. METADATA : This DOC is about {doc.metadata[metadata_key]} \n CONTENT:{doc.page_content.strip()}"
	            for i, doc in enumerate(format_results, start=1)
	        )

	    def history(self, hist):
	        """Flatten chat history into a single string for the prompt.

	        Turns whose content is the canned "unable to find an answer" reply
	        are skipped so that failed attempts are not fed back as context.

	        Args:
	            hist: Iterable of mappings with ``'role'`` and ``'content'``
	                string entries.

	        Returns:
	            The remaining turns, each rendered as
	            ``|Role:<role>Content:<content>|`` and concatenated; an empty
	            string when nothing remains.
	        """
	        # The original accumulated this text but never returned it, so
	        # every caller received None.
	        return ''.join(
	            '|Role:' + turn['role'] + 'Content:' + turn['content'] + '|'
	            for turn in hist
	            if turn['content'] != self._NO_ANSWER
	        )

	    def gpt_loaders(self, query: str, history: str):
	        """Build the general question-answering chain (OpenAI model).

	        Args:
	            query: The user's question.
	            history: Chat history already flattened to text.

	        Returns:
	            ``(rag_chain, question)``: the prompt | model | string-parser
	            chain, and the dict of template inputs (``context``,
	            ``question``, ``history``) to run it with.
	        """
	        # Plain string: PromptTemplate fills {question}, {context}, {history}.
	        template = """
	# You are an excellent Question & Answering BOT based on Context.
	# TASK : Given a question and the context, you are required to answer the question..
	# User questions may be given as a user_query (or) User_question (or) User_scenario.
	===============================
	#USER_QUERY : {question}
	===============================
	#METADATA_OF_CONTEXT :
	-> The context given is related to INDIAN-TAXATION.
	#CONTEXT : {context}
	===============================
	You are also given previous ChatHistories (User question and corresponding AI answer) to you as extra data.
	--# When to take the history as CONTEXT: Only if the history is relevant to the current question, you are permitted to take the chat history as a context.
	--# If it is not relevant to the current question, do not take it.
	#Chat History : {history}
	===============================
	-> Don't provide your own answer that is not in the given context.
	-> If you can provide a similar answer from the context that may be relevant but not exactly correct for the question, you can provide that answer.
	-> Try to provide a proper output for the question. Don't explain any questions too lengthy max[100 words].
	===============================
	# OUTPUT FORMAT:
	-> Your output may be given to a voice model for a speech output. Try to be precise with your words. At the same time, fill the user with your answer.
	-> Don't provide any further explanation apart from the answer output.
	# STEP 1 : Generate a output for the query from the context:
	# STEP 2 : -> Based on the current output check if it is relevant to the question again.
	-> If you are not 100% able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."

	"""
	        rag_prompt = PromptTemplate.from_template(template)
	        rag_chain = rag_prompt | self.openai_client | StrOutputParser()
	        question = {
	            "context": self.format_docs(self.retriever.multiple_contexts(query)),
	            "question": query,
	            "history": history,
	        }
	        return rag_chain, question

	    def gpt_loaders_id(self, query: str, history: str, id: str):
	        """Build the chain that answers from one uploaded PDF (Groq model).

	        Args:
	            query: The user's question.
	            history: Chat history already flattened to text.
	            id: Identifier passed to ``retriever.id_filter`` to restrict
	                retrieval (presumably the identifier returned when the PDF
	                was uploaded - confirm against the caller).

	        Returns:
	            ``(rag_chain, question)``: the prompt | model | string-parser
	            chain, and the dict of template inputs (``context``,
	            ``question``, ``history``) to run it with.
	        """
	        # Plain string: PromptTemplate fills {question}, {context}, {history}.
	        template = """
	# You are an excellent Question & Answering BOT. Given a question and the context you will answer the question only based on the given context.
	# You will be given a user_query (or) User_question (or) User_scenario.
	# TASK: Your task is to provide an Answer to the USER_QUERY with the given CONTEXT_DATA.
	===============================
	#USER_QUERY : {question}
	===============================
	#METADATA_OF_CONTEXT : -> The context given is a given from the user pdf input.
	-> Based on the user_query use the context accordingly.
	#CONTEXT : {context}
	===============================
	You are also given previous ChatHistories (User question and corressponding AI answer) to you as an extra data.
	--# When to take the history as CONTEXT : Only if the history is relevant to the current question you are permitted to take the chat history as a context.
	--# If it is not relevant to the current question do not take it.
	#Chat History : {history}
	===============================
	-> You are allowed to provide the answer only from the given context.
	-> Don't provide your own answer that is not in the given context.
	-> If you are not able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."
	-> Try to be a precise and provide a proper output for the question. Don't explain any questions too lengthy max[100 words].
	-> Provide answer only to the question that is asked.
	===============================
	# OUTPUT FORMAT:
	-> Your output may be given to a voice model for a speech output. Try to be precise with your words. At the same time, fill the user with your answer
	-> Don't provide any etc explanation apart from the answer output.
	"""
	        rag_prompt = PromptTemplate.from_template(template)
	        rag_chain = rag_prompt | self.groq | StrOutputParser()
	        question = {
	            "context": self.format_docs(self.retriever.id_filter(query, id), id=True),
	            "question": query,
	            "history": history,
	        }
	        return rag_chain, question

	class Vector_db():
	    """Splits user PDFs into chunks and stores them in a Qdrant collection."""

	    # Name of the Qdrant collection that holds user-uploaded PDF chunks.
	    _USER_COLLECTION = "siel-ai-user"

	    def __init__(self):
	        """Create the text splitter, the Qdrant client and the OpenAI client.

	        The Qdrant endpoint and key are read from the ``QDRANT_URL`` and
	        ``QDRANT_API_KEY`` environment variables.
	        """
	        self.text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=1024,
	            chunk_overlap=256,
	            length_function=len,
	            is_separator_regex=False,
	        )
	        self.qdrant_client = QdrantClient(
	            url=os.getenv("QDRANT_URL"),
	            api_key=os.getenv("QDRANT_API_KEY"),
	        )
	        self.openai_client = OpenAI()

	    @staticmethod
	    def _unique_ids(count):
	        """Return ``count`` independently generated UUID4 strings."""
	        return [str(uuid4()) for _ in range(count)]

	    def get_embed(self, texts):
	        """Embed ``texts`` with ``text-embedding-3-large``.

	        Only the first embedding of the response (``data[0]``) is returned,
	        so this is effectively a single-input helper.
	        """
	        response = self.openai_client.embeddings.create(
	            input=texts, model="text-embedding-3-large"
	        )
	        return response.data[0].embedding

	    def text_split(self, full_text, meta):
	        """Split ``full_text`` into chunk documents, each carrying ``meta``."""
	        return self.text_splitter.create_documents([full_text], metadatas=[meta])

	    def load_data(self, pdf_path: str):
	        """Load a PDF and return the text of all its pages concatenated."""
	        pages = PyPDFLoader(pdf_path).load()
	        return ''.join(page.page_content for page in pages)

	    def getdocs(self, about, filename):
	        """Load ``filename`` and split it into chunk documents.

	        Args:
	            about: Value stored as the ``DOCUMENT_NAME`` metadata.
	            filename: Path of the PDF to read.

	        Returns:
	            ``(documents, identifier)`` where ``identifier`` is a SHA-256
	            hex digest of the text plus the current time. The timestamp
	            makes repeated uploads of the same file get different
	            identifiers. It is also stored as the ``ID`` metadata of every
	            chunk.
	        """
	        text = self.load_data(filename)
	        data = (text + str(time.time())).encode('utf-8')
	        identifier = hashlib.sha256(data).hexdigest()
	        metadata = {'DOCUMENT_NAME': about, 'ID': str(identifier)}
	        documents = self.text_split(text, metadata)
	        return documents, identifier

	    def upload_pdfs_user(self, path, delete=False):
	        """Chunk the PDF at ``path`` and store it in the user collection.

	        Args:
	            path: Path of the PDF; its base name becomes the chunks'
	                ``DOCUMENT_NAME`` metadata.
	            delete: When true, an existing collection is dropped first, so
	                only this upload remains afterwards.

	        Returns:
	            The identifier generated by ``getdocs`` for this upload.
	        """
	        collection = self._USER_COLLECTION
	        if delete and self.qdrant_client.collection_exists(collection):
	            self.qdrant_client.delete_collection(collection)
	        if not self.qdrant_client.collection_exists(collection):
	            self.qdrant_client.create_collection(
	                collection_name=collection,
	                # NOTE(review): 1536 has to equal the vector size produced by
	                # the default OpenAIEmbeddings() model used below - confirm.
	                vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
	            )
	        vector_store = QdrantVectorStore(
	            client=self.qdrant_client,
	            collection_name=collection,
	            embedding=OpenAIEmbeddings(),
	        )
	        documents, identifier = self.getdocs(os.path.basename(path), path)
	        # One fresh point id per chunk. The original repeated a single UUID
	        # for every chunk, so all chunks of a PDF shared one point id.
	        ids = self._unique_ids(len(documents))
	        vector_store.add_documents(documents=documents, ids=ids)
	        return identifier

	class Speech_Text():
	    """Speech-to-text through Groq and text-to-speech through Deepgram."""

	    def __init__(self):
	        """Create both API clients and the speech options.

	        Keys are read from the ``GROQ_API_KEY`` and ``VOICE_API_KEY``
	        environment variables.
	        """
	        groq_key = os.getenv("GROQ_API_KEY")
	        voice_key = os.environ.get("VOICE_API_KEY")
	        self.client = Groq(api_key=groq_key)
	        self.deepgram = DeepgramClient(voice_key)
	        self.options = SpeakOptions(model="aura-luna-en")

	    def get_transcript(self, audio):
	        """Transcribe recorded audio to text.

	        Args:
	            audio: Indexable pair whose item 0 is the sample rate and whose
	                item 1 is the sample data (presumably the
	                ``(rate, ndarray)`` tuple of a UI audio widget - confirm
	                against the caller).

	        Returns:
	            The ``text`` attribute of the transcription response.
	        """
	        sample_rate = audio[0]
	        samples = audio[1]
	        # Encode to MP3 in memory; nothing is written to disk here.
	        mp3_stream = io.BytesIO()
	        sf.write(mp3_stream, samples, samplerate=sample_rate, format="MP3")
	        mp3_bytes = mp3_stream.getvalue()
	        result = self.client.audio.transcriptions.create(
	            file=("audio.mp3", mp3_bytes),
	            model="distil-whisper-large-v3-en",
	            response_format="json",
	            temperature=0.0,
	        )
	        return result.text

	    def speech_synthesis(self, text: str):
	        """Synthesize ``text`` to speech and return the audio file's bytes.

	        The audio is saved to ``audio.mp3`` in the working directory and
	        then read back. Any failure is printed and ``None`` is returned
	        instead of raising.
	        """
	        payload = {"text": text}
	        out_path = "audio.mp3"
	        try:
	            self.deepgram.speak.v("1").save(out_path, payload, self.options)
	            with open(out_path, "rb") as handle:
	                return handle.read()
	        except Exception as exc:
	            print(f"Exception: {exc}")
	        return None