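"""RAG chatbot API for the Thapar Institute of Engineering and Technology (TIET).

A FastAPI app that builds a FAISS vector store from a Word document (GET /setup)
and answers TIET-related questions over it with an OpenAI chat model (POST /chat).
All caches and the vector store live under /tmp so the app can run in environments
with a read-only application directory, such as a Hugging Face Space.
"""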
import os
import re
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import UnstructuredWordDocumentLoader as DocxLoader
from fastapi.middleware.cors import CORSMiddleware
from fastapi import FastAPI
from pydantic import BaseModel
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import nltk
import time
# Set writable paths for cache and data
cache_dir = '/tmp'
writable_dir = os.path.join(cache_dir, 'vectors_db')
nltk_data_path = os.path.join(cache_dir, 'nltk_data')
# Configure NLTK and other library paths
os.environ['TRANSFORMERS_CACHE'] = os.path.join(cache_dir, 'transformers_cache')
os.environ['HF_HOME'] = os.path.join(cache_dir, 'huggingface')
os.environ['XDG_CACHE_HOME'] = cache_dir
# Add NLTK data path
nltk.data.path.append(nltk_data_path)
# Ensure the directories exist
os.makedirs(nltk_data_path, exist_ok=True)
os.makedirs(writable_dir, exist_ok=True)
# Download the 'punkt' tokenizer, which the unstructured document loader
# relies on for sentence tokenization
nltk.download('punkt', download_dir=nltk_data_path)
def clean_response(response):
    """Normalise an LLM answer: trim whitespace, strip enclosing quotes,
    collapse repeated newlines, and drop literal '\\n' escape sequences."""
    # Remove any leading/trailing whitespace, including newlines
    cleaned = response.strip()
    # Remove any enclosing quotation marks
    cleaned = re.sub(r'^["\']+|["\']+$', '', cleaned)
    # Replace runs of consecutive newlines with a single newline
    cleaned = re.sub(r'\n+', '\n', cleaned)
    # Remove literal backslash-n escape sequences left in the text
    cleaned = cleaned.replace('\\n', '')
    return cleaned
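# Example: clean_response('"Fees are listed online.\n\nContact admissions."')
# returns 'Fees are listed online.\nContact admissions.'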
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
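# Note: allow_origins=["*"] accepts requests from any origin, which is convenient
# for development; restrict it to known frontend origins in production.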
openai_api_key = os.environ.get('OPENAI_API_KEY')
llm = ChatOpenAI(
    api_key=openai_api_key,
    model_name="gpt-4-turbo-preview",
    temperature=0.7,
    max_tokens=200
)
@app.get("/")
def read_root():
return {"Hello": "World"}
class Query(BaseModel):
query_text: str
prompt = ChatPromptTemplate.from_template(
    """
    You are a helpful assistant designed specifically for the Thapar Institute of Engineering and Technology (TIET), a renowned technical college. Your task is to answer all queries related to TIET concisely. Every response you provide should be relevant to the context of TIET. If a question falls outside this context, decline by stating, 'Sorry, I cannot help with that.' If you do not know the answer to a question, do not fabricate a response; politely decline instead.
    If the query is not related to TIET or falls outside the context of education, respond with:
    "Sorry, I cannot help with that. I'm specifically designed to answer questions about the Thapar Institute of Engineering and Technology. For more information, please contact our toll-free number 18002024100 or e-mail us at admissions@thapar.edu."
    <context>
    {context}
    </context>
    Question: {input}
    """
)
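# The retrieval chain below fills {context} with retrieved document chunks and
# {input} with the user's question before the prompt is sent to the model.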
def vector_embedding():
    try:
        file_path = "./data/Data.docx"
        if not os.path.exists(file_path):
            print(f"The file {file_path} does not exist.")
            return {"response": "Error: Data file not found"}
        loader = DocxLoader(file_path)
        documents = loader.load()
        print(f"Loaded document: {file_path}")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        chunks = text_splitter.split_documents(documents)
        print(f"Created {len(chunks)} chunks.")
        # Reuse the shared embedding model so indexing and querying always match
        model_norm = get_embeddings()
        # Save the FAISS vector store to a writable directory
        db = FAISS.from_documents(chunks, model_norm)
        db.save_local(writable_dir)
        print(f"Vector store created and saved successfully to {writable_dir}.")
        return {"response": "Vector Store DB Is Ready"}
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return {"response": f"Error: {str(e)}"}
def get_embeddings():
    # BGE embeddings with normalisation enabled, so similarity search behaves
    # like cosine similarity
    model_name = "BAAI/bge-base-en"
    encode_kwargs = {'normalize_embeddings': True}
    return HuggingFaceBgeEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)
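# The /chat endpoint loads the FAISS index saved by /setup. The index must be
# queried with the same embedding model that built it, hence get_embeddings().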
@app.post("/chat")
def read_item(query: Query):
try:
embeddings = get_embeddings()
vectors = FAISS.load_local(writable_dir, embeddings, allow_dangerous_deserialization=True)
except Exception as e:
print(f"Error loading vector store: {str(e)}")
return {"response": "Vector Store Not Found or Error Loading. Please run /setup first."}
prompt1 = query.query_text
if prompt1:
start = time.process_time()
document_chain = create_stuff_documents_chain(llm, prompt)
retriever = vectors.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)
response = retrieval_chain.invoke({'input': prompt1})
print("Response time:", time.process_time() - start)
# Apply the cleaning function to the response
cleaned_response = clean_response(response['answer'])
print("Cleaned response:", repr(cleaned_response))
return {"response": cleaned_response}
else:
return {"response": "No Query Found"}
@app.get("/setup")
def setup():
return vector_embedding()
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)
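# Example usage once the server is running (assuming the default port 7860):
#   curl http://localhost:7860/setup
#   curl -X POST http://localhost:7860/chat \
#        -H 'Content-Type: application/json' \
#        -d '{"query_text": "What programmes does TIET offer?"}'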