Browse files
app/main.py CHANGED (+93 -116)
@@ -1,6 +1,9 @@
 import os
 import re
-
+import time
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
 from langchain_openai import ChatOpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.combine_documents import create_stuff_documents_chain
@@ -8,63 +11,24 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain.chains import create_retrieval_chain
 from langchain_community.vectorstores import FAISS
 from langchain_community.document_loaders import UnstructuredWordDocumentLoader as DocxLoader
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi import FastAPI
-from pydantic import BaseModel
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
-import nltk # Importing NLTK
-import time
-
-import os
-import nltk
-
-# Set writable paths for cache and data
-cache_dir = '/tmp'
-nltk_data_path = os.path.join(cache_dir, 'nltk_data')
-
-# Configure NLTK and other library paths
-os.environ['TRANSFORMERS_CACHE'] = os.path.join(cache_dir, 'transformers_cache')
-os.environ['HF_HOME'] = os.path.join(cache_dir, 'huggingface')
-os.environ['XDG_CACHE_HOME'] = cache_dir
-
-# Add NLTK data path
-nltk.data.path.append(nltk_data_path)
-
-# Ensure the directory exists
-try:
-    os.makedirs(nltk_data_path, exist_ok=True)
-except OSError as e:
-    print(f"Error creating directory {nltk_data_path}: {e}")
-    raise
-
-# Download required NLTK resources
-try:
-    nltk.download('punkt', download_dir=nltk_data_path)
-    print("NLTK 'punkt' resource downloaded successfully.")
-except Exception as e:
-    print(f"Error downloading NLTK resources: {e}")
-    raise
-
-


+# Utility function to clean the response
 def clean_response(response):
-
+    if not response:
+        return "Sorry, I couldn't generate a response."
     cleaned = response.strip()
-
-    # Remove any enclosing quotation marks
     cleaned = re.sub(r'^["\']+|["\']+$', '', cleaned)
-
-    # Replace multiple newlines with a single newline
     cleaned = re.sub(r'\n+', '\n', cleaned)
-
-    # Remove any remaining '\n' characters
     cleaned = cleaned.replace('\\n', '')
-
     return cleaned

+
+# Initialize FastAPI app
 app = FastAPI()

+# CORS Middleware setup
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
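The hunk above also adds an empty-response guard to clean_response. As a quick standalone check, here is the helper copied verbatim from the new file, applied to an invented sample answer:

import re

# clean_response as defined in the new app/main.py
def clean_response(response):
    if not response:
        return "Sorry, I couldn't generate a response."
    cleaned = response.strip()
    cleaned = re.sub(r'^["\']+|["\']+$', '', cleaned)  # strip enclosing quotes
    cleaned = re.sub(r'\n+', '\n', cleaned)            # collapse runs of newlines
    cleaned = cleaned.replace('\\n', '')               # drop literal "\n" sequences
    return cleaned

raw = '"TIET is in Patiala.\n\n\nIt was established in 1956.\\n"'
print(clean_response(raw))
# TIET is in Patiala.
# It was established in 1956.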
@@ -73,103 +37,116 @@ app.add_middleware(
     allow_headers=["*"],
 )

-
+# Global Variables
+openai_api_key = os.getenv('OPENAI_API_KEY') # Ensure this is set in your environment
+VECTOR_DB_PATH = "./vectors_db"
+DATA_FILE_PATH = "./data/Data.docx"
+MODEL_NAME = "BAAI/bge-base-en"
+
+# Initialize OpenAI LLM
 llm = ChatOpenAI(
     api_key=openai_api_key,
-    model_name="gpt-4-turbo-preview", #
-    temperature=0.7
+    model_name="gpt-4-turbo-preview", # Use "gpt-3.5-turbo" for cost efficiency if required
+    temperature=0.7,
+)
+
+# Prompt template
+prompt = ChatPromptTemplate.from_template(
+    """
+    You are a helpful assistant designed specifically for the Thapar Institute of Engineering and Technology (TIET), a renowned technical college. Your task is to answer all queries related to TIET. Every response you provide should be relevant to the context of TIET. If a question falls outside of this context, please decline by stating, 'Sorry, I cannot help with that.' If you do not know the answer to a question, do not attempt to fabricate a response; instead, politely decline.
+    If the query is not related to TIET or falls outside the context of education, respond with:
+    "Sorry, I cannot help with that. I'm specifically designed to answer questions about the Thapar Institute of Engineering and Technology.
+    For more information, please contact at our toll-free number: 18002024100 or E-mail us at admissions@thapar.edu
+    <context>
+    {context}
+    </context>
+    Question: {input}
+    """
 )

+
+# Route: Home
 @app.get("/")
 def read_root():
-    return {"
+    return {"message": "Welcome to the ThaparGPT API!"}
+

+# Route: Chat Endpoint
 class Query(BaseModel):
     query_text: str

-prompt = ChatPromptTemplate.from_template(
-    """
-    You are a helpful assistant designed specifically for the Thapar Institute of Engineering and Technology (TIET), a renowned technical college. Your task is to answer all queries related to TIET. Every response you provide should be relevant to the context of TIET. If a question falls outside of this context, please decline by stating, 'Sorry, I cannot help with that.' If you do not know the answer to a question, do not attempt to fabricate a response; instead, politely decline.
-    You may elaborate on your answers slightly to provide more information, but avoid sounding boastful or exaggerating. Stay focused on the context provided.
-    If the query is not related to TIET or falls outside the context of education, respond with:
-    "Sorry, I cannot help with that. I'm specifically designed to answer questions about the Thapar Institute of Engineering and Technology.
-    For more information, please contact at our toll-free number: 18002024100 or E-mail us at admissions@thapar.edu
-    <context>
-    {context}
-    </context>
-    Question: {input}
-    """
-)

+@app.post("/chat")
+def chat(query: Query):
+    try:
+        # Load the vector store
+        embeddings = get_embeddings()
+        vectors = FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
+    except Exception as e:
+        print(f"Error loading vector store: {str(e)}")
+        raise HTTPException(status_code=500, detail="Vector Store not found or loading failed. Please run /setup first.")
+
+    # Retrieve and process the query
+    query_text = query.query_text
+    if query_text:
+        start_time = time.process_time()
+        document_chain = create_stuff_documents_chain(llm, prompt)
+        retriever = vectors.as_retriever()
+        retrieval_chain = create_retrieval_chain(retriever, document_chain)
+
+        try:
+            response = retrieval_chain.invoke({'input': query_text})
+        except Exception as e:
+            print(f"Error during query processing: {str(e)}")
+            raise HTTPException(status_code=500, detail="Error processing the query.")
+
+        print("Response time:", time.process_time() - start_time)
+        cleaned_response = clean_response(response.get('answer', ''))
+        return {"response": cleaned_response}
+    else:
+        raise HTTPException(status_code=400, detail="No query found in the request.")
+
+
+# Route: Setup Endpoint
+@app.get("/setup")
+def setup():
+    return vector_embedding()
+
+
+# Utility: Create Vector Embeddings
 def vector_embedding():
     try:
-
-
-
-        return {"response": "Error: Data file not found"}
+        if not os.path.exists(DATA_FILE_PATH):
+            print(f"The file {DATA_FILE_PATH} does not exist.")
+            raise HTTPException(status_code=404, detail="Data file not found.")

-
+        # Load and split document
+        loader = DocxLoader(DATA_FILE_PATH)
         documents = loader.load()
-
-        print(f"Loaded document: {file_path}")
+        print(f"Loaded document: {DATA_FILE_PATH}")

         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
         chunks = text_splitter.split_documents(documents)
-
         print(f"Created {len(chunks)} chunks.")

-
-
-
-
-        db = FAISS.from_documents(chunks, model_norm)
-        db.save_local("./vectors_db")
-
+        # Create vector store
+        embeddings = get_embeddings()
+        db = FAISS.from_documents(chunks, embeddings)
+        db.save_local(VECTOR_DB_PATH)
         print("Vector store created and saved successfully.")
-        return {"response": "Vector Store DB
-
+        return {"response": "Vector Store DB is ready."}
     except Exception as e:
-        print(f"
-
+        print(f"Error during setup: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error during setup: {str(e)}")
+

+# Utility: Load Embedding Model
 def get_embeddings():
-    model_name = "BAAI/bge-base-en"
     encode_kwargs = {'normalize_embeddings': True}
-
-    return model_norm
+    return HuggingFaceBgeEmbeddings(model_name=MODEL_NAME, encode_kwargs=encode_kwargs)

-@app.post("/chat") # Changed from /anthropic to /chat
-def read_item(query: Query):
-    try:
-        embeddings = get_embeddings()
-        vectors = FAISS.load_local("./vectors_db", embeddings, allow_dangerous_deserialization=True)
-    except Exception as e:
-        print(f"Error loading vector store: {str(e)}")
-        return {"response": "Vector Store Not Found or Error Loading. Please run /setup first."}
-
-    prompt1 = query.query_text
-    if prompt1:
-        start = time.process_time()
-        document_chain = create_stuff_documents_chain(llm, prompt)
-        retriever = vectors.as_retriever()
-        retrieval_chain = create_retrieval_chain(retriever, document_chain)
-        response = retrieval_chain.invoke({'input': prompt1})
-        print("Response time:", time.process_time() - start)
-
-        # Apply the cleaning function to the response
-        cleaned_response = clean_response(response['answer'])
-
-        # For debugging, print the cleaned response
-        print("Cleaned response:", repr(cleaned_response))
-
-        return cleaned_response
-    else:
-        return "No Query Found"
-
-@app.get("/setup")
-def setup():
-    return vector_embedding()

+# Main entry point
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8000)
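After this change, the /setup and /chat routes share state only through the FAISS index persisted at VECTOR_DB_PATH. A minimal sketch of that handoff, with two toy strings standing in for the Data.docx chunks (assumes langchain_community, faiss-cpu, and sentence-transformers are installed; the first run downloads the BAAI/bge-base-en model):

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-base-en",
    encode_kwargs={'normalize_embeddings': True},
)

# /setup side: embed the chunks and persist the index to disk.
db = FAISS.from_texts(
    ["TIET is located in Patiala, Punjab.",
     "Admissions queries go to admissions@thapar.edu."],
    embeddings,
)
db.save_local("./vectors_db")

# /chat side: reload the index. load_local unpickles part of the index,
# which is why the explicit allow_dangerous_deserialization opt-in is required.
vectors = FAISS.load_local("./vectors_db", embeddings, allow_dangerous_deserialization=True)
print(vectors.similarity_search("Where is TIET?", k=1)[0].page_content)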
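End to end, the intended flow is one GET /setup to build the index, then a POST /chat per question. A minimal client sketch, assuming the app runs locally on port 8000 with OPENAI_API_KEY exported and ./data/Data.docx in place (the question text is invented, and requests is an extra dependency here):

import requests

BASE = "http://localhost:8000"

# Build (or rebuild) the FAISS index from ./data/Data.docx.
print(requests.get(f"{BASE}/setup").json())  # {"response": "Vector Store DB is ready."}

# Ask a question; /chat wraps the cleaned answer in a JSON object.
r = requests.post(f"{BASE}/chat", json={"query_text": "What courses does TIET offer?"})
r.raise_for_status()
print(r.json()["response"])

Note that unlike the old read_item, the new chat route reports failures as 4xx/5xx responses via HTTPException rather than 200s carrying error strings, so raise_for_status() cleanly separates errors from real answers.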