import os
import re
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import UnstructuredWordDocumentLoader as DocxLoader
from fastapi.middleware.cors import CORSMiddleware
from fastapi import FastAPI
from pydantic import BaseModel
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import nltk
import time
# Set writable paths for cache and data
cache_dir = '/tmp'
writable_dir = os.path.join(cache_dir, 'vectors_db')
nltk_data_path = os.path.join(cache_dir, 'nltk_data')

# Point the Hugging Face / generic cache locations at the writable directory
os.environ['TRANSFORMERS_CACHE'] = os.path.join(cache_dir, 'transformers_cache')
os.environ['HF_HOME'] = os.path.join(cache_dir, 'huggingface')
os.environ['XDG_CACHE_HOME'] = cache_dir

# Ensure the directories exist
os.makedirs(nltk_data_path, exist_ok=True)
os.makedirs(writable_dir, exist_ok=True)

# Add NLTK data path: resources are downloaded into a custom directory below,
# so that directory has to be on NLTK's search path or lookups will not see it.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download required NLTK resources
nltk.download('punkt', download_dir=nltk_data_path)
def clean_response(response):
    """Normalize a raw model answer before it is returned to the client.

    Args:
        response: The answer text to clean.

    Returns:
        The text with surrounding whitespace and leading/trailing quote
        characters removed, runs of newlines collapsed to one newline, and
        literal backslash-n escape sequences dropped.
    """
    # Remove any leading/trailing whitespace, including newlines
    cleaned = response.strip()
    # Remove quote characters at either end (each end is handled independently)
    cleaned = re.sub(r'^["\']+|["\']+$', '', cleaned)
    # Replace multiple newlines with a single newline
    cleaned = re.sub(r'\n+', '\n', cleaned)
    # Drop literal two-character "\n" escape sequences (backslash + n);
    # real newline characters are kept.
    cleaned = cleaned.replace('\\n', '')
    # Quote/escape removal can expose new whitespace at the edges; trim it too.
    return cleaned.strip()
# ASGI application object served by uvicorn.
app = FastAPI()

# API key for the OpenAI-backed chat model; None when the variable is unset.
openai_api_key = os.environ.get('OPENAI_API_KEY')

# NOTE(review): the argument list of this call was not visible in the reviewed
# source (the call was left unterminated). Only the API key is passed here, so
# model name and sampling settings fall back to the library defaults — confirm
# the intended model/temperature and add them explicitly.
llm = ChatOpenAI(api_key=openai_api_key)
# NOTE(review): no route decorator for this handler is visible in the reviewed
# source; confirm it is registered on `app` (presumably as the root path).
def read_root():
    """Return a fixed greeting payload."""
    return {"Hello": "World"}
class Query(BaseModel):
    """Request body for a chat query."""

    # The user's question; read_item reads it as `query.query_text`.
    query_text: str
# Prompt template used by the document chain.
# `{input}` receives the user's question and `{context}` receives the retrieved
# documents; create_stuff_documents_chain requires a `context` variable in the
# prompt, which the reviewed template text did not contain.
# NOTE(review): the template was not enclosed in string delimiters in the
# reviewed source and the exact placement/markup of the context section is an
# assumption — confirm against the intended prompt. The quoted refusal text has
# no closing quote in the reviewed source and is reproduced unchanged.
prompt = ChatPromptTemplate.from_template(
    """
You are a helpful assistant designed specifically for the Thapar Institute of Engineering and Technology (TIET), a renowned technical college. Your task is to answer all queries related to TIET in a concise manner. Every response you provide should be relevant to the context of TIET. If a question falls outside of this context, please decline by stating, 'Sorry, I cannot help with that.' If you do not know the answer to a question, do not attempt to fabricate a response; instead, politely decline.
If the query is not related to TIET or falls outside the context of education, respond with:
"Sorry, I cannot help with that. I'm specifically designed to answer questions about the Thapar Institute of Engineering and Technology.
For more information, please contact our toll-free number: 18002024100 or E-mail us at admissions@thapar.edu
<context>
{context}
</context>
Question: {input}
"""
)
def vector_embedding():
    """Build the FAISS vector store from the Word document and save it to disk.

    Loads ``./data/Data.docx``, splits it with ``chunk_size=500`` and
    ``chunk_overlap=100``, embeds the chunks, and writes the resulting index
    to ``writable_dir`` so that it can later be loaded with
    ``FAISS.load_local``.

    Returns:
        dict with a single ``"response"`` key: a readiness message on success,
        or an ``"Error: ..."`` message when the data file is missing or any
        step raises.
    """
    file_path = "./data/Data.docx"
    try:
        if not os.path.exists(file_path):
            print(f"The file {file_path} does not exist.")
            return {"response": "Error: Data file not found"}

        loader = DocxLoader(file_path)
        documents = loader.load()
        print(f"Loaded document: {file_path}")

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        chunks = text_splitter.split_documents(documents)
        print(f"Created {len(chunks)} chunks.")

        # Use the shared helper so indexing and querying embed with the same model.
        db = FAISS.from_documents(chunks, get_embeddings())

        # Save FAISS vector store to a writable directory. Without this the
        # index only exists in memory and a later load from disk finds nothing.
        db.save_local(writable_dir)
        print(f"Vector store created and saved successfully to {writable_dir}.")
        return {"response": "Vector Store DB Is Ready"}
    except Exception as e:
        # Endpoint boundary: report the failure to the caller instead of raising.
        print(f"An error occurred: {str(e)}")
        return {"response": f"Error: {str(e)}"}
# Lazily created embedding model shared by all callers of get_embeddings().
_EMBEDDINGS_CACHE = None


def get_embeddings():
    """Return the BGE embedding model used for indexing and retrieval.

    The model (``BAAI/bge-base-en`` with normalized embeddings) is created on
    the first call and reused afterwards, because read_item calls this on
    every request and constructing the model each time is wasted work.

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance.
    """
    global _EMBEDDINGS_CACHE
    if _EMBEDDINGS_CACHE is None:
        model_name = "BAAI/bge-base-en"
        encode_kwargs = {'normalize_embeddings': True}
        _EMBEDDINGS_CACHE = HuggingFaceBgeEmbeddings(
            model_name=model_name,
            encode_kwargs=encode_kwargs,
        )
    return _EMBEDDINGS_CACHE
# NOTE(review): no route decorator for this handler is visible in the reviewed
# source; confirm it is registered on `app` and under which path/method.
def read_item(query: Query):
    """Answer a query using retrieval over the saved FAISS index.

    Args:
        query: Request body; only ``query.query_text`` is read.

    Returns:
        dict with a single ``"response"`` key holding the cleaned answer, or a
        fixed message when the vector store cannot be loaded or the query
        text is empty.
    """
    try:
        embeddings = get_embeddings()
        # NOTE(review): this flag permits pickle-based deserialization of the
        # stored index files; make sure the directory is writable only by this
        # service, since loading a tampered file can execute arbitrary code.
        vectors = FAISS.load_local(writable_dir, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        print(f"Error loading vector store: {str(e)}")
        return {"response": "Vector Store Not Found or Error Loading. Please run /setup first."}

    prompt1 = query.query_text
    if not prompt1:
        return {"response": "No Query Found"}

    # Wall-clock timer: the call below mostly waits on the model/network, which
    # a CPU-time clock such as process_time() would not count.
    start = time.perf_counter()
    document_chain = create_stuff_documents_chain(llm, prompt)
    retriever = vectors.as_retriever()
    retrieval_chain = create_retrieval_chain(retriever, document_chain)
    response = retrieval_chain.invoke({'input': prompt1})
    print("Response time:", time.perf_counter() - start)

    # Apply the cleaning function to the response
    cleaned_response = clean_response(response['answer'])
    print("Cleaned response:", repr(cleaned_response))
    return {"response": cleaned_response}
# NOTE(review): no route decorator for this handler is visible in the reviewed
# source; the error message returned by read_item refers to "/setup", so it is
# presumably meant to be exposed under that path — confirm.
def setup():
    """Build the vector store and return vector_embedding()'s result dict."""
    return vector_embedding()
# Start the ASGI server only when this file is executed as a script.
if __name__ == "__main__":
    import uvicorn

    # NOTE(review): host is an empty string; asyncio treats an empty host as
    # "all interfaces", but confirm this is intended rather than an explicit
    # bind address.
    uvicorn.run(app, host="", port=7860)