suneeln-duke committed on
Commit
77961ad
1 Parent(s): fe82b46
Files changed (5) hide show
  1. Dockerfile +22 -0
  2. main.py +78 -0
  3. requirements.txt +18 -0
  4. scripts/mongo_utils.py +145 -0
  5. scripts/rag_utils.py +135 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11.2

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Use apt-get (stable CLI for scripts, unlike `apt`) and clean the package
# lists afterwards to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
    && rm -rf /var/lib/apt/lists/*

RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

COPY --chown=user . $HOME/app

ENV H2O_WAVE_LISTEN=":7860"
ENV H2O_WAVE_ADDRESS="http://127.0.0.1:7860"

# The original image defined no default process and fell back to the base
# image's python REPL. main.py is a FastAPI app and uvicorn is in
# requirements.txt; 7860 matches the port in the env vars above.
# NOTE(review): confirm this is how the deployment launches the app.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
22
+
main.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, Request

# The package name is lowercase "flask"; `from Flask import ...` raises
# ModuleNotFoundError on case-sensitive filesystems (e.g. the Linux container).
from flask import jsonify

from scripts import mongo_utils
from scripts import rag_utils
from dotenv import load_dotenv
import os

load_dotenv()

# FastAPI must be *instantiated*. With the bare class, @app.post('/summ')
# would call the unbound method with the path string as `self` and fail.
app = FastAPI()

# Single shared Mongo client for all request handlers.
client = mongo_utils.connect_to_mongo()
print("Connected to MongoDB")
20
+
21
def captitalize_name(name):
    """Turn a snake_case identifier into a space-separated, capitalized title.

    (Function name kept as-is — misspelled "captitalize" — so existing callers
    keep working.)
    """
    words = []
    for part in name.split("_"):
        words.append(part.capitalize())
    return " ".join(words)
25
+
26
@app.post('/summ')
async def summarize(request: Request):
    """Summarize a story excerpt using the PDF's vector store.

    Expects a JSON body with `pdf_path` and `text`; returns {'summary': str}.
    """
    # Request.body() is a coroutine returning raw bytes — subscripting it
    # raises TypeError. The JSON payload must be awaited via request.json().
    data = await request.json()
    pdf_path = data['pdf_path']
    text = data['text']

    vs = mongo_utils.get_vs(pdf_path, client)
    summary = rag_utils.summ(vs, text)

    return {'summary': summary}
37
+
38
@app.post('/clf')
async def classify(request: Request):
    """Classify whether the excerpt contains a story-affecting decision.

    Expects a JSON body with `pdf_path` and `text`; returns
    {'decision': 'yes'|'no'} (lower-cased model output).
    """
    # Await and decode the JSON body; request.body() is an un-awaited coroutine.
    data = await request.json()
    pdf_path = data['pdf_path']
    text = data['text']

    vs = mongo_utils.get_vs(pdf_path, client)
    decision = rag_utils.clf_seq(vs, text).lower()

    # FastAPI serializes dicts itself; Flask's jsonify requires a Flask app
    # context and would raise here.
    return {'decision': decision}
50
+
51
@app.post('/options')
async def options(request: Request):
    """Generate four decision paths for the given story excerpt.

    Expects a JSON body with `pdf_path` and `text`; returns
    {'options': list[str]}.
    """
    import ast

    # Await and decode the JSON body; request.body() is an un-awaited coroutine.
    data = await request.json()
    pdf_path = data['pdf_path']
    text = data['text']

    vs = mongo_utils.get_vs(pdf_path, client)
    # literal_eval safely parses the model's list-of-strings reply; the
    # original `eval` would execute arbitrary code coming back from the LLM.
    option_list = ast.literal_eval(rag_utils.gen_options(vs, text))

    # Plain dict return — FastAPI handles JSON serialization.
    return {'options': option_list}
61
+
62
@app.post('/path')
async def path(request: Request):
    """Generate the next story segment from an excerpt and a chosen decision.

    Expects a JSON body with `pdf_path`, `text`, and `decision`; returns
    {'path': str}.
    """
    # Await and decode the JSON body; request.body() is an un-awaited coroutine.
    data = await request.json()
    pdf_path = data['pdf_path']
    text = data['text']
    decision = data['decision']

    vs = mongo_utils.get_vs(pdf_path, client)
    # Distinct local name avoids shadowing this handler function.
    next_path = rag_utils.gen_path(vs, text, decision)

    # Plain dict return — FastAPI handles JSON serialization.
    return {'path': next_path}
75
+
76
+
77
if __name__ == '__main__':
    # FastAPI apps have no .run() method (that is Flask's API); serve with
    # uvicorn, which requirements.txt already pins. Port 7860 matches the
    # port configured in the Dockerfile environment.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ flask==3.0.3
4
+ pypdf==4.2.0
5
+ pypdf2==3.0.1
6
+ pymongo==4.7.0
7
+ langchain==0.1.16
8
+ langchain_community==0.0.34
9
+ langchain_core==0.1.46
10
+ langchain_openai==0.0.2
11
+ openai>=0.26.2,<=1.6.1
12
+ pandas==2.2.2
13
+ scikit-learn==1.4.2
14
+ seaborn==0.13.2
15
+ matplotlib==3.8.4
16
+ python-dotenv==1.0.1
17
+ certifi
18
+ Flask-CORS
scripts/mongo_utils.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pymongo import MongoClient
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import sys, os, certifi
from dotenv import load_dotenv
from pathlib import Path
import PyPDF2

sys.path.append("..")

load_dotenv()

# Re-export the key only when it is actually set: assigning None into
# os.environ raises TypeError, so the original line crashed on import
# whenever OPENAI_API_KEY was missing.
_openai_key = os.environ.get("OPENAI_API_KEY")
if _openai_key is not None:
    os.environ["OPENAI_API_KEY"] = _openai_key
24
+
25
+
26
def read_pages(pdf_file):
    """Extract the text of every page of *pdf_file*.

    Returns a list of strings, one per page, in document order.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # reader.pages iterates the pages in order, so no index bookkeeping needed.
    return [page.extract_text() for page in reader.pages]
40
+
41
def connect_to_mongo():
    """Open a MongoClient from the MONGO_URI env var and verify it with a ping.

    Connection failures are printed, not raised; the client is returned either way.
    """
    client = MongoClient(os.environ.get("MONGO_URI"), tlsCAFile=certifi.where())
    # Send a ping to confirm a successful connection
    try:
        client.admin.command('ping')
    except Exception as e:
        # Best-effort: report the failure but still hand back the client.
        print(e)
    else:
        print("Pinged your deployment. You successfully connected to MongoDB!")
    return client
52
+
53
def insert_pages(pdf_file, client=None):
    """Store one document per page of *pdf_file* in Mongo and return the stored docs.

    Each document carries the page text, its 0-based index, and the PDF
    file stem as `source`. The collection is named "<stem>-pages" inside
    the MONGO_PAGES_DB database.
    """
    if not client:
        client = connect_to_mongo()

    name = Path(pdf_file).stem
    docs = [
        {"text": content, "page": idx, "source": name}
        for idx, content in enumerate(read_pages(pdf_file))
    ]

    collection = client[os.environ.get("MONGO_PAGES_DB")][f"{name}-pages"]
    collection.insert_many(docs)

    # Return what is actually stored (includes Mongo-assigned _id fields).
    return list(collection.find())
71
+
72
+
73
def get_pages(name, client=None):
    """Return the page documents for *name*, inserting them on first use."""
    if not client:
        client = connect_to_mongo()

    db = client[os.environ.get("MONGO_PAGES_DB")]
    coll_name = f"{name}-pages"

    if coll_name in db.list_collection_names():
        print("using existing page collection")
        return list(db[coll_name].find())

    print("inserting pages")
    # NOTE(review): insert_pages treats *name* as a PDF path on this branch —
    # confirm callers pass the path rather than the bare stem.
    return insert_pages(name, client=client)
92
+
93
def insert_vs(pdf_file, client=None):
    """Chunk *pdf_file*, embed the chunks with OpenAI, and store them in Atlas.

    Builds a "<stem>-vs" collection inside MONGO_VS_DB and returns the
    resulting MongoDBAtlasVectorSearch instance.
    """
    name = Path(pdf_file).stem

    if not client:
        client = connect_to_mongo()

    collection = client[os.environ.get("MONGO_VS_DB")][f"{name}-vs"]

    # Load the PDF and cut it into small overlapping chunks for retrieval.
    documents = PyPDFLoader(pdf_file).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100)
    chunks = splitter.split_documents(documents)

    embedder = OpenAIEmbeddings(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        disallowed_special=(),
    )

    # Create embeddings in atlas vector store
    return MongoDBAtlasVectorSearch.from_documents(
        documents=chunks,
        embedding=embedder,
        collection=collection,
        index_name=os.environ.get("MONGO_INDEX_DB"),
    )
123
+
124
def get_vs(name, client=None):
    """Return the Atlas vector store for *name*, building it on first use."""
    if not client:
        client = connect_to_mongo()

    db = client[os.environ.get("MONGO_VS_DB")]
    coll_name = f"{name}-vs"

    if coll_name not in db.list_collection_names():
        print("inserting vs")
        return insert_vs(name, client=client)

    print("using existing vs collection")
    embedder = OpenAIEmbeddings(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        disallowed_special=(),
    )
    # Reconnect to the existing "<db>.<name>-vs" namespace instead of
    # re-embedding the documents.
    return MongoDBAtlasVectorSearch.from_connection_string(
        os.environ.get("MONGO_URI"),
        os.environ.get("MONGO_VS_DB") + "." + coll_name,
        embedder,
        index_name=os.environ.get("MONGO_INDEX_DB"),
    )
scripts/rag_utils.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
import os, sys
from dotenv import load_dotenv

load_dotenv()

sys.path.append("..")

# Re-export the key only when it is actually set: assigning None into
# os.environ raises TypeError, so the original line crashed on import
# whenever OPENAI_API_KEY was missing.
_openai_key = os.environ.get("OPENAI_API_KEY")
if _openai_key is not None:
    os.environ["OPENAI_API_KEY"] = _openai_key
18
+
19
def prep_config(vs):
    """Build a RAG chain over vector store *vs*.

    Returns a (retrieval_chain, output_parser) pair: the chain retrieves the
    3 most similar chunks, formats them into the prompt, queries GPT-4, and
    parses the reply to a plain string via StrOutputParser.
    """
    retriever = vs.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    )

    template = """Answer the question: {question} based only on the following context:
    context: {context}
    """

    # from_template infers {context}/{question} from the template itself.
    # The original passed misspelled kwargs ("input_varaibles",
    # "output_variables") that had no effect, and created a JsonOutputParser
    # that was immediately overwritten — both removed.
    prompt = PromptTemplate.from_template(template)

    output_parser = StrOutputParser()

    model = ChatOpenAI(openai_api_key=os.environ.get("OPENAI_API_KEY"),
                       model_name='gpt-4',
                       temperature=0.3)

    def format_docs(docs):
        # Join retrieved chunks into a single context string.
        return "\n\n".join(doc.page_content for doc in docs)

    retrieval_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | model
        | output_parser
    )

    return retrieval_chain, output_parser
55
+
56
+
57
def gen_options(vs, text):
    """Ask the RAG chain for four decision paths at this point in the story.

    Returns the raw model reply (expected to be a list of strings rendered
    as text by the chain's string parser).
    """
    chain, _ = prep_config(vs)

    query = f"""
    Act as the author of a Choose Your Own Adventure Book. This book is special as it is based on existing material.
    Now, as with any choose your own adventure book, you'll have to generate decision paths based on the given story excerpt
    Your job is to generate 4 decision paths for the given point in the story.
    One among the 4 decision paths should be the original path, the other 3 should deviate from the original path in a sensible manner.
    The decision paths should be generated in a way that they are coherent with the existing story.
    Limit each decision path to a succint sentence.
    Return the 4 decision paths as a list of strings.

    Story Excerpt: {text}

    """

    return chain.invoke(query)
77
+
78
def gen_path(vs, text, decision):
    """Generate the next story segment from an excerpt and the chosen decision.

    An empty *decision* asks the model to simply continue the story.
    Returns the parsed string reply.
    """
    chain, parser = prep_config(vs)

    query = f"""
    Act as the author of a Choose Your Own Adventure Book. This book is special as it is based on existing material.
    Now, as with any choose your own adventure book, you'll have to generate new story paths based on a relevant excerpt of the story and the decision taken.
    Your job is to generate the next part of the story based on the given part of the story and the decision taken.
    The new story path should be coherent with the existing story, and should be around 6-8 sentences.
    If the decision string is empty, your task is just to generate the next part of the story based on the given part of the story.
    Return the new story path as a string.

    Story Excerpt: {text}

    Decision: {decision}
    """

    return parser.parse(chain.invoke(query))
98
+
99
def clf_seq(vs, text):
    """Classify whether *text* contains a story-affecting decision.

    Returns the parsed model reply, expected to be the label "yes" or "no".
    """
    chain, parser = prep_config(vs)

    query = f"""
    Classify whether the given chunk involves a decision that will effect the story or not.

    A decision is defined as when the character goes about making a choice between two or more options.
    The decision should be significant enough to affect the story in a major way.
    It doesn't really involve emotions, feelings or thoughts, but what the character does, or what happens to them.
    This involes interactions between characters, or the character and the environment.
    What isn't a decision is chunks describing the setting, or the character's thoughts or feelings.

    Return the answer as the corresponding decision label "yes" or "no"

    {text}
    """

    return parser.parse(chain.invoke(query))
120
+
121
def summ(vs, text):
    """Summarize *text* in 3-4 narrative sentences coherent with the story.

    Returns the parsed string reply from the RAG chain.
    """
    chain, parser = prep_config(vs)

    query = f"""
    Summarize the given text in a narrative manner as a part of storytelling.
    The summary should be around 3-4 sentences and should be coherent with the existing story.

    Return the summary as a string.
    {text}
    """

    return parser.parse(chain.invoke(query))