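"""Core RAG helpers.

Script      - builds the prompt / LLM chains (general Indian-taxation context
              and per-document context filtered by an upload id).
Vector_db   - loads PDFs, splits them into chunks and stores them in Qdrant.
Speech_Text - speech-to-text via Whisper on Groq and text-to-speech via Deepgram.
"""
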
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from retriever import Retriever
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
import os
import io
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from openai import OpenAI
from groq import Groq
import soundfile as sf
from deepgram import DeepgramClient, SpeakOptions
from langchain_groq import ChatGroq
import hashlib
import time
from uuid import uuid4
from dotenv import load_dotenv

load_dotenv('.env')

class Script():
    def __init__(self):
        self.retriever = Retriever()
        self.openai_client = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
        self.groq = ChatGroq(model='llama3-70b-8192')
        self.groq1 = ChatGroq(model='llama3-8b-8192')

    def format_docs(self, format_results, id=False):
        # Flatten retrieved documents into one prompt-ready context string.
        # Docs from a user-uploaded PDF (id=True) carry a DOCUMENT_NAME key;
        # docs from the general collection carry DOCUMENT_IS_ABOUT.
        formatted_docs = []
        for i, doc in enumerate(format_results, start=1):
            if id:
                metadata = doc.metadata['DOCUMENT_NAME']
            else:
                metadata = doc.metadata['DOCUMENT_IS_ABOUT']
            page = doc.page_content.strip()
            content = f"**DOC {i}. METADATA : This DOC is about {metadata} \n CONTENT:{page}**"
            formatted_docs.append(content)
        return "".join(formatted_docs)
    
    def history(self, hist):
        # Serialize previous chat turns into a compact string for the prompt,
        # skipping turns where the bot could not find an answer.
        text = ''
        for turn in hist:
            if turn['content'] != 'Sorry! Unable to find an answer for your question. Try Again.':
                text += '|Role:' + turn['role'] + 'Content:' + turn['content'] + '|'
        return text
    
    def gpt_loaders(self,query:str,history:str):
        template= f"""
                    # You are an excellent Question & Answering BOT based on Context.
                    # TASK : Given a question and the context, you are required to answer the question.
                    # User questions may be given as a user_query (or) User_question (or) User_scenario.
                    ===============================
                    #USER_QUERY :  {{question}}
                    ===============================
                    #METADATA_OF_CONTEXT :
                    -> The context given is related to INDIAN-TAXATION.
                    #CONTEXT : {{context}}
                    ===============================
                    You are also given previous ChatHistories (User question and corresponding AI answer) to you as extra data.
                    --# When to take the history as CONTEXT: Only if the history is relevant to the current question, you are permitted to take the chat history as a context.
                    --# If it is not relevant to the current question, do not take it.
                    #Chat History : {{history}}
                    ===============================
                    -> Don't provide your own answer that is not in the given context.
                    -> If you can provide a similar answer from the context that may be relevant but not exactly correct for the question, you can provide that answer.
                    -> Try to provide a proper output for the question. Keep the answer concise (at most 100 words).
                    ===============================
                    # OUTPUT FORMAT:
                    -> Your output may be passed to a voice model for speech output. Be precise with your words while still giving the user a complete answer.
                    -> Don't provide any further explanation apart from the answer output.
                    # STEP 1 : Generate an output for the query from the context.
                    # STEP 2 : -> Based on the generated output, check again whether it is relevant to the question.
                               -> If you are not 100% able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."

                """
        # template = f"""ANSWER THE USER QUESTION BASED ON THE GIVEN CONTEXT ALONE.
        #     USER QUESTION : {{question}}
        #             CONTEXT : {{context}}
        #             {{history}}
        # """
        rag_prompt = PromptTemplate.from_template(template)
        rag_chain = (
                    rag_prompt
                    | self.openai_client
                    | StrOutputParser()
                    )
        question = {"context": self.format_docs(self.retriever.multiple_contexts(query)), "question": query, "history": history}
        return rag_chain,question
    
    def gpt_loaders_id(self,query:str,history:str,id:str):
        template= f"""
                    # You are an excellent Question & Answering BOT. Given a question and the context, you will answer the question only based on the given context.
                    # You will be given a user_query (or) User_question (or) User_scenario.
                    # TASK: Your task is to provide an Answer to the USER_QUERY with the given CONTEXT_DATA.
                    ===============================
                    #USER_QUERY :  {{question}}
                    ===============================
                    #METADATA_OF_CONTEXT : -> The context given is taken from the user's PDF input.
                                        -> Based on the user_query use the context accordingly.
                    #CONTEXT : {{context}}
                    ===============================
                    You are also given previous ChatHistories (User question and corresponding AI answer) as extra data.
                    --# When to take the history as CONTEXT : Only if the history is relevant to the current question, you are permitted to take the chat history as a context.
                    --# If it is not relevant to the current question, do not take it.
                    #Chat History : {{history}}
                    ===============================
                    -> You are allowed to provide the answer only from the given context.
                    -> Don't provide your own answer that is not in the given context.
                    -> If you are not able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."
                    -> Try to be precise and provide a proper output for the question. Keep the answer concise (at most 100 words).
                    -> Provide answer only to the question that is asked.
                    ===============================
                    # OUTPUT FORMAT:
                        -> Your output may be passed to a voice model for speech output. Be precise with your words while still giving the user a complete answer.
                        -> Don't provide any extra explanation apart from the answer output.
                """
        rag_prompt = PromptTemplate.from_template(template)
        rag_chain = (
                    rag_prompt
                    | self.groq
                    | StrOutputParser()
                    )
        question = {"context": self.format_docs(self.retriever.id_filter(query, id), id=True), "question": query, "history": history}
        return rag_chain,question

class Vector_db():
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
                            chunk_size=1024,
                            chunk_overlap=256,
                            length_function=len,
                            is_separator_regex=False,
                            )
        self.qdrant_client = QdrantClient(
                        url=os.getenv("QDRANT_URL"),
                        api_key=os.getenv("QDRANT_API_KEY")
                    )
        self.openai_client = OpenAI()

    def get_embed(self, texts):
        # Return the embedding of the first input item using text-embedding-3-large.
        return self.openai_client.embeddings.create(input=texts, model="text-embedding-3-large").data[0].embedding
    
    def text_split(self,full_text,meta):
        documents = self.text_splitter.create_documents([full_text],metadatas=[meta])
        return documents

    def load_data(self, pdf_path: str):
        # Load the PDF and concatenate the text of every page.
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        text = ''
        for page in pages:
            text += page.page_content
        return text
    
    def getdocs(self, about, filename):
        # Hash the text together with the current timestamp so repeated uploads
        # of the same file still get a unique identifier for per-document filtering.
        text = self.load_data(filename)
        data = (text + str(time.time())).encode('utf-8')
        identifier = hashlib.sha256(data).hexdigest()
        metadata = {'DOCUMENT_NAME': about, 'ID': str(identifier)}
        documents = self.text_split(text, metadata)
        return documents, identifier
    
    def upload_pdfs_user(self, path, delete=False):
        # Optionally drop the existing user collection, then (re)create it with
        # vectors sized for the default OpenAIEmbeddings model (1536 dims).
        if delete and self.qdrant_client.collection_exists("siel-ai-user"):
            self.qdrant_client.delete_collection("siel-ai-user")
        if not self.qdrant_client.collection_exists("siel-ai-user"):
            self.qdrant_client.create_collection(
                collection_name="siel-ai-user",
                vectors_config=VectorParams(size=1536,
                                            distance=Distance.COSINE),
            )
        vector_store = QdrantVectorStore(
            client=self.qdrant_client,
            collection_name="siel-ai-user",
            embedding=OpenAIEmbeddings(),
        )
        documents = []
        meta_data = os.path.basename(path)
        docs, identifier = self.getdocs(meta_data, path)
        documents += docs
        # Each chunk needs its own uuid4; reusing one id would make Qdrant upsert
        # every chunk onto the same point, keeping only the last one.
        ids = [str(uuid4()) for _ in documents]
        vector_store.add_documents(documents=documents, ids=ids)
        return identifier

class Speech_Text():
    def __init__(self):
        self.client = Groq(api_key=os.getenv("GROQ_API_KEY"))
        self.deepgram = DeepgramClient(os.environ.get("VOICE_API_KEY"))
        self.options = SpeakOptions(
            model="aura-luna-en",
        )
        
    # Function to get transcript from audio
    def get_transcript(self, audio):
        # `audio` is expected as a (sample_rate, data) tuple (e.g. the format
        # returned by a Gradio Audio component).
        audio_buffer = io.BytesIO()
        sf.write(audio_buffer, audio[1], samplerate=audio[0], format="MP3")
        audio_buffer.seek(0)
        transcription = self.client.audio.transcriptions.create(
            file=("audio.mp3", audio_buffer.read()),
            model="distil-whisper-large-v3-en",
            response_format="json",
            temperature=0.0,
        )
        return transcription.text

    # Function for speech synthesis
    def speech_synthesis(self,text: str):
        TEXT = {"text": text}
        FILENAME = "audio.mp3"
        try:
            self.deepgram.speak.v("1").save(FILENAME, TEXT, self.options)
            with open(FILENAME, "rb") as audio_file:
                audio_data = audio_file.read()
            return audio_data
        except Exception as e:
            print(f"Exception: {e}")
            return None
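
# Minimal usage sketch (not part of the module's original flow): it assumes the
# .env file provides the OpenAI, Groq, Qdrant and Deepgram keys read above, and
# that a local "sample.pdf" exists; the path and query below are illustrative only.
if __name__ == "__main__":
    vdb = Vector_db()
    doc_id = vdb.upload_pdfs_user("sample.pdf", delete=True)  # hypothetical PDF path

    bot = Script()
    chain, inputs = bot.gpt_loaders_id("What is this document about?", history="", id=doc_id)
    answer = chain.invoke(inputs)
    print(answer)

    # Optionally synthesize the answer as speech with Deepgram:
    # audio_bytes = Speech_Text().speech_synthesis(answer)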