from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from retriever import Retriever
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
import os
import io
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from openai import OpenAI
from groq import Groq
import soundfile as sf
from deepgram import DeepgramClient, SpeakOptions
from langchain_groq import ChatGroq
import hashlib
import time
from uuid import uuid4
from dotenv import load_dotenv
load_dotenv('.env')
class Script():
def __init__(self):
self.retriever = Retriever()
self.openai_client = ChatOpenAI(model="gpt-4o-mini",temperature=0.1)
self.groq = ChatGroq(model='llama3-70b-8192')
self.groq1 = ChatGroq(model='llama3-8b-8192')
    def format_docs(self,format_results,id=False):
        # Builds a single prompt-ready context string from the retrieved documents.
        # id=True uses the per-PDF 'DOCUMENT_NAME' metadata; otherwise 'DOCUMENT_IS_ABOUT'.
        formatted_docs = []
        for i,doc in enumerate(format_results,start=1):
            if id:
                metadata = doc.metadata['DOCUMENT_NAME']
            else:
                metadata = doc.metadata['DOCUMENT_IS_ABOUT']
            page = doc.page_content.strip()
            content = f"**DOC {i}. METADATA : This DOC is about {metadata} \n CONTENT:{page}**"
            formatted_docs.append(content)
        return "".join(formatted_docs)
    def history(self,hist):
        # Flattens the chat history into a compact string, skipping turns where no answer was found.
        text = ''
        for i in hist:
            if i['content'] != 'Sorry! Unable to find an answer for your question. Try Again.':
                text += '|Role:'+i['role']+'Content:'+i['content']+'|'
        return text
def gpt_loaders(self,query:str,history:str):
template= f"""
# You are an excellent Question & Answering BOT based on Context.
# TASK : Given a question and the context, you are required to answer the question.
# User questions may be given as a user_query (or) User_question (or) User_scenario.
===============================
#USER_QUERY : {{question}}
===============================
#METADATA_OF_CONTEXT :
-> The context given is related to INDIAN-TAXATION.
#CONTEXT : {{context}}
===============================
You are also given previous ChatHistories (User question and corresponding AI answer) to you as extra data.
--# When to take the history as CONTEXT: Only if the history is relevant to the current question, you are permitted to take the chat history as a context.
--# If it is not relevant to the current question, do not take it.
#Chat History : {{history}}
===============================
-> Don't provide your own answer that is not in the given context.
-> If you can provide a similar answer from the context that may be relevant but not exactly correct for the question, you can provide that answer.
-> Try to provide a proper output for the question. Keep explanations concise (maximum 100 words).
===============================
# OUTPUT FORMAT:
-> Your output may be given to a voice model for speech output. Be precise with your words while still giving the user a complete answer.
-> Don't provide any further explanation apart from the answer output.
# STEP 1 : Generate an output for the query from the context.
# STEP 2 : -> Based on the current output, check again whether it is relevant to the question.
-> If you are not 100% able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."
"""
# template = f"""ANSWER THE USER QUESTION BASED ON THE GIVEN CONTEXT ALONE.
# USER QUESTION : {{question}}
# CONTEXT : {{context}}
# {{history}}
# """
rag_prompt = PromptTemplate.from_template(template)
rag_chain = (
rag_prompt
| self.openai_client
| StrOutputParser()
)
question ={"context": self.format_docs(self.retriever.multiple_contexts(query)), "question": query, "history": history}
return rag_chain,question
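    # Same pattern as gpt_loaders, but the context is restricted to one uploaded PDF
    # (selected via the id metadata filter) and the answer is generated with the Groq
    # Llama-3 70B model instead of GPT-4o-mini.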
def gpt_loaders_id(self,query:str,history:str,id:str):
template= f"""
# You are an excellent Question & Answering BOT. Given a question and the context, you will answer the question based only on the given context.
# You will be given a user_query (or) User_question (or) User_scenario.
# TASK: Your task is to provide an Answer to the USER_QUERY with the given CONTEXT_DATA.
===============================
#USER_QUERY : {{question}}
===============================
#METADATA_OF_CONTEXT : -> The context given comes from the user's PDF input.
-> Based on the user_query, use the context accordingly.
#CONTEXT : {{context}}
===============================
You are also given previous ChatHistories (User question and corresponding AI answer) to you as extra data.
--# When to take the history as CONTEXT : Only if the history is relevant to the current question, you are permitted to take the chat history as a context.
--# If it is not relevant to the current question, do not take it.
#Chat History : {{history}}
===============================
-> You are allowed to provide the answer only from the given context.
-> Don't provide your own answer that is not in the given context.
-> If you are not able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."
-> Try to be precise and provide a proper output for the question. Keep explanations concise (maximum 100 words).
-> Provide answer only to the question that is asked.
===============================
# OUTPUT FORMAT:
-> Your output may be given to a voice model for speech output. Be precise with your words while still giving the user a complete answer.
-> Don't provide any extra explanation apart from the answer output.
"""
rag_prompt = PromptTemplate.from_template(template)
rag_chain = (
rag_prompt
| self.groq
| StrOutputParser()
)
question ={"context": self.format_docs(self.retriever.id_filter(query,id),id=True), "question": query, "history": history}
return rag_chain,question
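# Illustrative usage sketch (not part of the original module): assumes OPENAI_API_KEY,
# GROQ_API_KEY and the Qdrant credentials are set in .env and that the Retriever-backed
# collection is already populated; the query text is a placeholder.
#
#   script = Script()
#   chain, inputs = script.gpt_loaders("What is the GST rate on restaurant services?", history="")
#   answer = chain.invoke(inputs)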
class Vector_db():
def __init__(self):
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1024,
chunk_overlap=256,
length_function=len,
is_separator_regex=False,
)
self.qdrant_client = QdrantClient(
url=os.getenv("QDRANT_URL"),
api_key=os.getenv("QDRANT_API_KEY")
)
self.openai_client = OpenAI()
    def get_embed(self, texts):
        # Returns one embedding vector for the given input using OpenAI's text-embedding-3-large model.
        return self.openai_client.embeddings.create(input=texts, model="text-embedding-3-large").data[0].embedding
def text_split(self,full_text,meta):
documents = self.text_splitter.create_documents([full_text],metadatas=[meta])
return documents
def load_data(self,pdf_path:str):
loader = PyPDFLoader(pdf_path)
file = loader.load()
text = ''
for i in file:
text+=i.page_content
return text
def getdocs(self,about,filename):
text = self.load_data(filename)
data = (text+str(time.time())).encode('utf-8')
identifier = hashlib.sha256(data).hexdigest()
metadata = {'DOCUMENT_NAME':about,'ID':str(identifier)}
documents = self.text_split(text,metadata)
return documents,identifier
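    # Uploads one user PDF into the "siel-ai-user" collection and returns the SHA-256
    # identifier stored in each chunk's metadata (presumably consumed by Retriever.id_filter
    # when answering questions against that specific document).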
    def upload_pdfs_user(self,path,delete=False):
        # Optionally drop and re-create the per-user collection before uploading.
        if delete:
            if self.qdrant_client.collection_exists("siel-ai-user"):
                self.qdrant_client.delete_collection("siel-ai-user")
        if not self.qdrant_client.collection_exists("siel-ai-user"):
            self.qdrant_client.create_collection(
                collection_name="siel-ai-user",
                vectors_config=VectorParams(size=1536,
                                            distance=Distance.COSINE),
            )
vector_store = QdrantVectorStore(
client=self.qdrant_client,
collection_name="siel-ai-user",
embedding=OpenAIEmbeddings(),
)
documents = []
meta_data = os.path.basename(path)
docs,identifier = self.getdocs(meta_data,path)
documents+=docs
        # uuid4 generates a unique id for each chunk; build the list with a comprehension so every
        # document gets its own id (reusing a single uuid would make Qdrant overwrite the points).
        ids = [str(uuid4()) for _ in range(len(documents))]
vector_store.add_documents(documents=documents, ids=ids)
return identifier
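# Illustrative usage sketch (assumed flow, not in the original file; "user_upload.pdf" is a
# placeholder path and the Qdrant/OpenAI credentials must be available via .env):
#
#   db = Vector_db()
#   doc_id = db.upload_pdfs_user("user_upload.pdf", delete=True)  # recreate the user collection
#   chain, inputs = Script().gpt_loaders_id("Summarise the document", history="", id=doc_id)
#   answer = chain.invoke(inputs)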
class Speech_Text():
def __init__(self):
self.client = Groq(api_key=os.getenv("GROQ_API_KEY"))
self.deepgram = DeepgramClient(os.environ.get("VOICE_API_KEY"))
self.options = SpeakOptions(
model="aura-luna-en",
)
# Function to get transcript from audio
    def get_transcript(self,audio):
        # audio is expected as a (sample_rate, numpy_array) tuple, e.g. the value produced by a Gradio Audio component.
        audio_buffer = io.BytesIO()
        sf.write(audio_buffer, audio[1], samplerate=audio[0], format="MP3")
audio_buffer.seek(0)
translation = self.client.audio.transcriptions.create(
file=("audio.mp3", audio_buffer.read()),
model="distil-whisper-large-v3-en",
response_format="json",
temperature=0.0,
)
return translation.text
# Function for speech synthesis
def speech_synthesis(self,text: str):
TEXT = {"text": text}
FILENAME = "audio.mp3"
try:
self.deepgram.speak.v("1").save(FILENAME, TEXT, self.options)
with open(FILENAME, "rb") as audio_file:
audio_data = audio_file.read()
return audio_data
except Exception as e:
print(f"Exception: {e}")
return None
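# Illustrative usage sketch (assumptions: GROQ_API_KEY and VOICE_API_KEY are set, and the
# audio tuple comes in as (sample_rate, numpy_array), e.g. from a Gradio Audio component):
#
#   stt_tts = Speech_Text()
#   user_text = stt_tts.get_transcript((sample_rate, samples))      # speech -> text via Groq Whisper
#   mp3_bytes = stt_tts.speech_synthesis("Hello! How can I help?")  # text -> speech, MP3 bytes or None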