diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..81ed570aded5304a64bc9f91ea7e60e3758bccdf --- /dev/null +++ b/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.11-slim-bullseye as builder + +# Tiktoken requires Rust toolchain, so build it in a separate stage +RUN apt-get update && apt-get install -y gcc curl +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && apt-get install --reinstall libc6-dev -y +ENV PATH="/root/.cargo/bin:${PATH}" +RUN pip install --upgrade pip && pip install tiktoken==0.5.2 +COPY requirements.txt . +RUN pip install -r requirements.txt +RUN apt-get install -y wget unzip +RUN wget https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.zip +RUN unzip mpnet-base-v2.zip -d model +RUN rm mpnet-base-v2.zip + +FROM python:3.11-slim-bullseye + +# Copy pre-built packages and binaries from builder stage +COPY --from=builder /usr/local/ /usr/local/ + +WORKDIR /app +COPY --from=builder /model /app/model + +COPY . /app/application +ENV FLASK_APP=app.py +ENV FLASK_DEBUG=true + +EXPOSE 7091 + +CMD ["gunicorn", "-w", "2", "--timeout", "120", "--bind", "0.0.0.0:7091", "application.wsgi:app"] diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/api/__pycache__/__init__.cpython-310.pyc b/api/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82dd1448e4ab7a588e98145c9ada38adf8341c28 Binary files /dev/null and b/api/__pycache__/__init__.cpython-310.pyc differ diff --git a/api/answer/__init__.py b/api/answer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/api/answer/__pycache__/__init__.cpython-310.pyc b/api/answer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c70fdb2ad7bb4caf0568659d439d7ffbb64ea4c Binary files /dev/null and b/api/answer/__pycache__/__init__.cpython-310.pyc differ diff --git a/api/answer/__pycache__/routes.cpython-310.pyc b/api/answer/__pycache__/routes.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a13a7ea2c0c6756344941ae445611f9376c6693 Binary files /dev/null and b/api/answer/__pycache__/routes.cpython-310.pyc differ diff --git a/api/answer/routes.py b/api/answer/routes.py new file mode 100644 index 0000000000000000000000000000000000000000..8c661e0bdad49ff1a7e84580214bfed1e19435c3 --- /dev/null +++ b/api/answer/routes.py @@ -0,0 +1,371 @@ +import asyncio +import os +from flask import Blueprint, request, Response +import json +import datetime +import logging +import traceback + +from pymongo import MongoClient +from bson.objectid import ObjectId +from transformers import GPT2TokenizerFast + + + +from application.core.settings import settings +from application.vectorstore.vector_creator import VectorCreator +from application.llm.llm_creator import LLMCreator +from application.error import bad_request + + + +logger = logging.getLogger(__name__) + +mongo = MongoClient(settings.MONGO_URI) +db = mongo["docsgpt"] +conversations_collection = db["conversations"] +vectors_collection = db["vectors"] +prompts_collection = db["prompts"] +answer = Blueprint('answer', __name__) + +if settings.LLM_NAME == "gpt4": + 
gpt_model = 'gpt-4' +elif settings.LLM_NAME == "anthropic": + gpt_model = 'claude-2' +else: + gpt_model = 'gpt-3.5-turbo' + +# load the prompts +current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +with open(os.path.join(current_dir, "prompts", "chat_combine_default.txt"), "r") as f: + chat_combine_template = f.read() + +with open(os.path.join(current_dir, "prompts", "chat_reduce_prompt.txt"), "r") as f: + chat_reduce_template = f.read() + +with open(os.path.join(current_dir, "prompts", "chat_combine_creative.txt"), "r") as f: + chat_combine_creative = f.read() + +with open(os.path.join(current_dir, "prompts", "chat_combine_strict.txt"), "r") as f: + chat_combine_strict = f.read() + +api_key_set = settings.API_KEY is not None +embeddings_key_set = settings.EMBEDDINGS_KEY is not None + + +async def async_generate(chain, question, chat_history): + result = await chain.arun({"question": question, "chat_history": chat_history}) + return result + + +def count_tokens(string): + tokenizer = GPT2TokenizerFast.from_pretrained('gpt2') + return len(tokenizer(string)['input_ids']) + + +def run_async_chain(chain, question, chat_history): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + result = {} + try: + answer = loop.run_until_complete(async_generate(chain, question, chat_history)) + finally: + loop.close() + result["answer"] = answer + return result + + +def get_vectorstore(data): + if "active_docs" in data: + if data["active_docs"].split("/")[0] == "default": + vectorstore = "" + elif data["active_docs"].split("/")[0] == "local": + vectorstore = "indexes/" + data["active_docs"] + else: + vectorstore = "vectors/" + data["active_docs"] + if data["active_docs"] == "default": + vectorstore = "" + else: + vectorstore = "" + vectorstore = os.path.join("application", vectorstore) + return vectorstore + + +def is_azure_configured(): + return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME + + +def complete_stream(question, docsearch, chat_history, api_key, prompt_id, conversation_id): + llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=api_key) + + if prompt_id == 'default': + prompt = chat_combine_template + elif prompt_id == 'creative': + prompt = chat_combine_creative + elif prompt_id == 'strict': + prompt = chat_combine_strict + else: + prompt = prompts_collection.find_one({"_id": ObjectId(prompt_id)})["content"] + + docs = docsearch.search(question, k=2) + if settings.LLM_NAME == "llama.cpp": + docs = [docs[0]] + # join all page_content together with a newline + docs_together = "\n".join([doc.page_content for doc in docs]) + p_chat_combine = prompt.replace("{summaries}", docs_together) + messages_combine = [{"role": "system", "content": p_chat_combine}] + source_log_docs = [] + for doc in docs: + if doc.metadata: + source_log_docs.append({"title": doc.metadata['title'].split('/')[-1], "text": doc.page_content}) + else: + source_log_docs.append({"title": doc.page_content, "text": doc.page_content}) + + if len(chat_history) > 1: + tokens_current_history = 0 + # count tokens in history + chat_history.reverse() + for i in chat_history: + if "prompt" in i and "response" in i: + tokens_batch = count_tokens(i["prompt"]) + count_tokens(i["response"]) + if tokens_current_history + tokens_batch < settings.TOKENS_MAX_HISTORY: + tokens_current_history += tokens_batch + messages_combine.append({"role": "user", "content": i["prompt"]}) + messages_combine.append({"role": "system", "content": i["response"]}) + 
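+    # the current question is appended last, after the system prompt and whatever
+    # history fit under settings.TOKENS_MAX_HISTORY, so it is always the final user turn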
messages_combine.append({"role": "user", "content": question}) + + response_full = "" + completion = llm.gen_stream(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME, + messages=messages_combine) + for line in completion: + data = json.dumps({"answer": str(line)}) + response_full += str(line) + yield f"data: {data}\n\n" + + # save conversation to database + if conversation_id is not None: + conversations_collection.update_one( + {"_id": ObjectId(conversation_id)}, + {"$push": {"queries": {"prompt": question, "response": response_full, "sources": source_log_docs}}}, + ) + + else: + # create new conversation + # generate summary + messages_summary = [{"role": "assistant", "content": "Summarise following conversation in no more than 3 " + "words, respond ONLY with the summary, use the same " + "language as the system \n\nUser: " + question + "\n\n" + + "AI: " + + response_full}, + {"role": "user", "content": "Summarise following conversation in no more than 3 words, " + "respond ONLY with the summary, use the same language as the " + "system"}] + + completion = llm.gen(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME, + messages=messages_summary, max_tokens=30) + conversation_id = conversations_collection.insert_one( + {"user": "local", + "date": datetime.datetime.utcnow(), + "name": completion, + "queries": [{"prompt": question, "response": response_full, "sources": source_log_docs}]} + ).inserted_id + + # send data.type = "end" to indicate that the stream has ended as json + data = json.dumps({"type": "id", "id": str(conversation_id)}) + yield f"data: {data}\n\n" + data = json.dumps({"type": "end"}) + yield f"data: {data}\n\n" + + +@answer.route("/stream", methods=["POST"]) +def stream(): + data = request.get_json() + # get parameter from url question + question = data["question"] + history = data["history"] + # history to json object from string + history = json.loads(history) + conversation_id = data["conversation_id"] + if 'prompt_id' in data: + prompt_id = data["prompt_id"] + else: + prompt_id = 'default' + + # check if active_docs is set + + if not api_key_set: + api_key = data["api_key"] + else: + api_key = settings.API_KEY + if not embeddings_key_set: + embeddings_key = data["embeddings_key"] + else: + embeddings_key = settings.EMBEDDINGS_KEY + if "active_docs" in data: + vectorstore = get_vectorstore({"active_docs": data["active_docs"]}) + else: + vectorstore = "" + docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key) + + return Response( + complete_stream(question, docsearch, + chat_history=history, api_key=api_key, + prompt_id=prompt_id, + conversation_id=conversation_id), mimetype="text/event-stream" + ) + + +@answer.route("/api/answer", methods=["POST"]) +def api_answer(): + data = request.get_json() + question = data["question"] + history = data["history"] + if "conversation_id" not in data: + conversation_id = None + else: + conversation_id = data["conversation_id"] + print("-" * 5) + if not api_key_set: + api_key = data["api_key"] + else: + api_key = settings.API_KEY + if not embeddings_key_set: + embeddings_key = data["embeddings_key"] + else: + embeddings_key = settings.EMBEDDINGS_KEY + if 'prompt_id' in data: + prompt_id = data["prompt_id"] + else: + prompt_id = 'default' + + if prompt_id == 'default': + prompt = chat_combine_template + elif prompt_id == 'creative': + prompt = chat_combine_creative + elif prompt_id == 'strict': + prompt = chat_combine_strict + else: + prompt = prompts_collection.find_one({"_id": 
ObjectId(prompt_id)})["content"] + + # use try and except to check for exception + try: + # check if the vectorstore is set + vectorstore = get_vectorstore(data) + # loading the index and the store and the prompt template + # Note if you have used other embeddings than OpenAI, you need to change the embeddings + docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key) + + + llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=api_key) + + + + docs = docsearch.search(question, k=2) + # join all page_content together with a newline + docs_together = "\n".join([doc.page_content for doc in docs]) + p_chat_combine = prompt.replace("{summaries}", docs_together) + messages_combine = [{"role": "system", "content": p_chat_combine}] + source_log_docs = [] + for doc in docs: + if doc.metadata: + source_log_docs.append({"title": doc.metadata['title'].split('/')[-1], "text": doc.page_content}) + else: + source_log_docs.append({"title": doc.page_content, "text": doc.page_content}) + # join all page_content together with a newline + + + if len(history) > 1: + tokens_current_history = 0 + # count tokens in history + history.reverse() + for i in history: + if "prompt" in i and "response" in i: + tokens_batch = count_tokens(i["prompt"]) + count_tokens(i["response"]) + if tokens_current_history + tokens_batch < settings.TOKENS_MAX_HISTORY: + tokens_current_history += tokens_batch + messages_combine.append({"role": "user", "content": i["prompt"]}) + messages_combine.append({"role": "system", "content": i["response"]}) + messages_combine.append({"role": "user", "content": question}) + + + completion = llm.gen(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME, + messages=messages_combine) + + + result = {"answer": completion, "sources": source_log_docs} + logger.debug(result) + + # generate conversationId + if conversation_id is not None: + conversations_collection.update_one( + {"_id": ObjectId(conversation_id)}, + {"$push": {"queries": {"prompt": question, + "response": result["answer"], "sources": result['sources']}}}, + ) + + else: + # create new conversation + # generate summary + messages_summary = [ + {"role": "assistant", "content": "Summarise following conversation in no more than 3 words, " + "respond ONLY with the summary, use the same language as the system \n\n" + "User: " + question + "\n\n" + "AI: " + result["answer"]}, + {"role": "user", "content": "Summarise following conversation in no more than 3 words, " + "respond ONLY with the summary, use the same language as the system"} + ] + + completion = llm.gen( + model=gpt_model, + engine=settings.AZURE_DEPLOYMENT_NAME, + messages=messages_summary, + max_tokens=30 + ) + conversation_id = conversations_collection.insert_one( + {"user": "local", + "date": datetime.datetime.utcnow(), + "name": completion, + "queries": [{"prompt": question, "response": result["answer"], "sources": source_log_docs}]} + ).inserted_id + + result["conversation_id"] = str(conversation_id) + + # mock result + # result = { + # "answer": "The answer is 42", + # "sources": ["https://en.wikipedia.org/wiki/42_(number)", "https://en.wikipedia.org/wiki/42_(number)"] + # } + return result + except Exception as e: + # print whole traceback + traceback.print_exc() + print(str(e)) + return bad_request(500, str(e)) + + +@answer.route("/api/search", methods=["POST"]) +def api_search(): + data = request.get_json() + # get parameter from url question + question = data["question"] + + if not embeddings_key_set: + embeddings_key = 
data["embeddings_key"] + else: + embeddings_key = settings.EMBEDDINGS_KEY + if "active_docs" in data: + vectorstore = get_vectorstore({"active_docs": data["active_docs"]}) + else: + vectorstore = "" + docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key) + + docs = docsearch.search(question, k=2) + + source_log_docs = [] + for doc in docs: + if doc.metadata: + source_log_docs.append({"title": doc.metadata['title'].split('/')[-1], "text": doc.page_content}) + else: + source_log_docs.append({"title": doc.page_content, "text": doc.page_content}) + #yield f"data:{data}\n\n" + return source_log_docs + diff --git a/api/internal/__init__.py b/api/internal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/api/internal/__pycache__/__init__.cpython-310.pyc b/api/internal/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b12b34e3892e0c1b90eb2ab655589176c16a41c3 Binary files /dev/null and b/api/internal/__pycache__/__init__.cpython-310.pyc differ diff --git a/api/internal/__pycache__/routes.cpython-310.pyc b/api/internal/__pycache__/routes.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7f4e03e1c61968bbc4d3b4d28e9730bc5cc707e Binary files /dev/null and b/api/internal/__pycache__/routes.cpython-310.pyc differ diff --git a/api/internal/routes.py b/api/internal/routes.py new file mode 100644 index 0000000000000000000000000000000000000000..e8a1b80b151601de3b8a70392449f75f0a203c55 --- /dev/null +++ b/api/internal/routes.py @@ -0,0 +1,69 @@ +import os +import datetime +from flask import Blueprint, request, send_from_directory +from pymongo import MongoClient +from werkzeug.utils import secure_filename + + +from application.core.settings import settings +mongo = MongoClient(settings.MONGO_URI) +db = mongo["docsgpt"] +conversations_collection = db["conversations"] +vectors_collection = db["vectors"] + +current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +internal = Blueprint('internal', __name__) +@internal.route("/api/download", methods=["get"]) +def download_file(): + user = secure_filename(request.args.get("user")) + job_name = secure_filename(request.args.get("name")) + filename = secure_filename(request.args.get("file")) + save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name) + return send_from_directory(save_dir, filename, as_attachment=True) + + + +@internal.route("/api/upload_index", methods=["POST"]) +def upload_index_files(): + """Upload two files(index.faiss, index.pkl) to the user's folder.""" + if "user" not in request.form: + return {"status": "no user"} + user = secure_filename(request.form["user"]) + if "name" not in request.form: + return {"status": "no name"} + job_name = secure_filename(request.form["name"]) + save_dir = os.path.join(current_dir, "indexes", user, job_name) + if settings.VECTOR_STORE == "faiss": + if "file_faiss" not in request.files: + print("No file part") + return {"status": "no file"} + file_faiss = request.files["file_faiss"] + if file_faiss.filename == "": + return {"status": "no file name"} + if "file_pkl" not in request.files: + print("No file part") + return {"status": "no file"} + file_pkl = request.files["file_pkl"] + if file_pkl.filename == "": + return {"status": "no file name"} + # saves index files + + if not os.path.exists(save_dir): + os.makedirs(save_dir) + 
file_faiss.save(os.path.join(save_dir, "index.faiss")) + file_pkl.save(os.path.join(save_dir, "index.pkl")) + # create entry in vectors_collection + vectors_collection.insert_one( + { + "user": user, + "name": job_name, + "language": job_name, + "location": save_dir, + "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), + "model": settings.EMBEDDINGS_NAME, + "type": "local", + } + ) + return {"status": "ok"} \ No newline at end of file diff --git a/api/user/__init__.py b/api/user/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/api/user/__pycache__/__init__.cpython-310.pyc b/api/user/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba17d39fe5a603209181f8748c5694fab37ccebf Binary files /dev/null and b/api/user/__pycache__/__init__.cpython-310.pyc differ diff --git a/api/user/__pycache__/routes.cpython-310.pyc b/api/user/__pycache__/routes.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d181c06e65243409e490bc8ef3e80d1fe8a6ea4 Binary files /dev/null and b/api/user/__pycache__/routes.cpython-310.pyc differ diff --git a/api/user/__pycache__/tasks.cpython-310.pyc b/api/user/__pycache__/tasks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..550c3aea49a511d99a305537ad9f4c416a1b7812 Binary files /dev/null and b/api/user/__pycache__/tasks.cpython-310.pyc differ diff --git a/api/user/routes.py b/api/user/routes.py new file mode 100644 index 0000000000000000000000000000000000000000..867425721026c2642e3ed5d201f68f192b6add6e --- /dev/null +++ b/api/user/routes.py @@ -0,0 +1,321 @@ +import os +from flask import Blueprint, request, jsonify +import requests +from pymongo import MongoClient +from bson.objectid import ObjectId +from werkzeug.utils import secure_filename + +from application.api.user.tasks import ingest + +from application.core.settings import settings +from application.vectorstore.vector_creator import VectorCreator + +mongo = MongoClient(settings.MONGO_URI) +db = mongo["docsgpt"] +conversations_collection = db["conversations"] +vectors_collection = db["vectors"] +prompts_collection = db["prompts"] +feedback_collection = db["feedback"] +user = Blueprint('user', __name__) + +current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +@user.route("/api/delete_conversation", methods=["POST"]) +def delete_conversation(): + # deletes a conversation from the database + conversation_id = request.args.get("id") + # write to mongodb + conversations_collection.delete_one( + { + "_id": ObjectId(conversation_id), + } + ) + + return {"status": "ok"} + +@user.route("/api/get_conversations", methods=["get"]) +def get_conversations(): + # provides a list of conversations + conversations = conversations_collection.find().sort("date", -1) + list_conversations = [] + for conversation in conversations: + list_conversations.append({"id": str(conversation["_id"]), "name": conversation["name"]}) + + #list_conversations = [{"id": "default", "name": "default"}, {"id": "jeff", "name": "jeff"}] + + return jsonify(list_conversations) + + +@user.route("/api/get_single_conversation", methods=["get"]) +def get_single_conversation(): + # provides data for a conversation + conversation_id = request.args.get("id") + conversation = conversations_collection.find_one({"_id": ObjectId(conversation_id)}) + return jsonify(conversation['queries']) + 
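+# a minimal request sketch for the route below (values are illustrative):
+#   POST /api/update_conversation_name
+#   {"id": "<conversation ObjectId as a string>", "name": "New conversation title"}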
+@user.route("/api/update_conversation_name", methods=["POST"]) +def update_conversation_name(): + # update data for a conversation + data = request.get_json() + id = data["id"] + name = data["name"] + conversations_collection.update_one({"_id": ObjectId(id)},{"$set":{"name":name}}) + return {"status": "ok"} + + +@user.route("/api/feedback", methods=["POST"]) +def api_feedback(): + data = request.get_json() + question = data["question"] + answer = data["answer"] + feedback = data["feedback"] + + + feedback_collection.insert_one( + { + "question": question, + "answer": answer, + "feedback": feedback, + } + ) + return {"status": "ok"} + +@user.route("/api/delete_by_ids", methods=["get"]) +def delete_by_ids(): + """Delete by ID. These are the IDs in the vectorstore""" + + ids = request.args.get("path") + if not ids: + return {"status": "error"} + + if settings.VECTOR_STORE == "faiss": + result = vectors_collection.delete_index(ids=ids) + if result: + return {"status": "ok"} + return {"status": "error"} + +@user.route("/api/delete_old", methods=["get"]) +def delete_old(): + """Delete old indexes.""" + import shutil + + path = request.args.get("path") + dirs = path.split("/") + dirs_clean = [] + for i in range(0, len(dirs)): + dirs_clean.append(secure_filename(dirs[i])) + # check that path strats with indexes or vectors + + if dirs_clean[0] not in ["indexes", "vectors"]: + return {"status": "error"} + path_clean = "/".join(dirs_clean) + vectors_collection.delete_one({"name": dirs_clean[-1], 'user': dirs_clean[-2]}) + if settings.VECTOR_STORE == "faiss": + try: + shutil.rmtree(os.path.join(current_dir, path_clean)) + except FileNotFoundError: + pass + else: + vetorstore = VectorCreator.create_vectorstore( + settings.VECTOR_STORE, path=os.path.join(current_dir, path_clean) + ) + vetorstore.delete_index() + + return {"status": "ok"} + +@user.route("/api/upload", methods=["POST"]) +def upload_file(): + """Upload a file to get vectorized and indexed.""" + if "user" not in request.form: + return {"status": "no user"} + user = secure_filename(request.form["user"]) + if "name" not in request.form: + return {"status": "no name"} + job_name = secure_filename(request.form["name"]) + # check if the post request has the file part + if "file" not in request.files: + print("No file part") + return {"status": "no file"} + file = request.files["file"] + if file.filename == "": + return {"status": "no file name"} + + if file: + filename = secure_filename(file.filename) + # save dir + save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name) + # create dir if not exists + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + file.save(os.path.join(save_dir, filename)) + task = ingest.delay(settings.UPLOAD_FOLDER, [".rst", ".md", ".pdf", ".txt", ".docx", + ".csv", ".epub", ".html", ".mdx"], + job_name, filename, user) + # task id + task_id = task.id + return {"status": "ok", "task_id": task_id} + else: + return {"status": "error"} + +@user.route("/api/task_status", methods=["GET"]) +def task_status(): + """Get celery job status.""" + task_id = request.args.get("task_id") + from application.celery import celery + task = celery.AsyncResult(task_id) + task_meta = task.info + return {"status": task.status, "result": task_meta} + + +@user.route("/api/combine", methods=["GET"]) +def combined_json(): + user = "local" + """Provide json file with combined available indexes.""" + # get json from https://d3dg1063dc54p9.cloudfront.net/combined.json + + data = [ + { + "name": "default", + "language": 
"default", + "version": "", + "description": "default", + "fullName": "default", + "date": "default", + "docLink": "default", + "model": settings.EMBEDDINGS_NAME, + "location": "remote", + } + ] + # structure: name, language, version, description, fullName, date, docLink + # append data from vectors_collection + for index in vectors_collection.find({"user": user}): + data.append( + { + "name": index["name"], + "language": index["language"], + "version": "", + "description": index["name"], + "fullName": index["name"], + "date": index["date"], + "docLink": index["location"], + "model": settings.EMBEDDINGS_NAME, + "location": "local", + } + ) + if settings.VECTOR_STORE == "faiss": + data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json() + for index in data_remote: + index["location"] = "remote" + data.append(index) + + return jsonify(data) + + +@user.route("/api/docs_check", methods=["POST"]) +def check_docs(): + # check if docs exist in a vectorstore folder + data = request.get_json() + # split docs on / and take first part + if data["docs"].split("/")[0] == "local": + return {"status": "exists"} + vectorstore = "vectors/" + data["docs"] + base_path = "https://raw.githubusercontent.com/arc53/DocsHUB/main/" + if os.path.exists(vectorstore) or data["docs"] == "default": + return {"status": "exists"} + else: + r = requests.get(base_path + vectorstore + "index.faiss") + + if r.status_code != 200: + return {"status": "null"} + else: + if not os.path.exists(vectorstore): + os.makedirs(vectorstore) + with open(vectorstore + "index.faiss", "wb") as f: + f.write(r.content) + + # download the store + r = requests.get(base_path + vectorstore + "index.pkl") + with open(vectorstore + "index.pkl", "wb") as f: + f.write(r.content) + + return {"status": "loaded"} + +@user.route("/api/create_prompt", methods=["POST"]) +def create_prompt(): + data = request.get_json() + content = data["content"] + name = data["name"] + if name == "": + return {"status": "error"} + user = "local" + resp = prompts_collection.insert_one( + { + "name": name, + "content": content, + "user": user, + } + ) + new_id = str(resp.inserted_id) + return {"id": new_id} + +@user.route("/api/get_prompts", methods=["GET"]) +def get_prompts(): + user = "local" + prompts = prompts_collection.find({"user": user}) + list_prompts = [] + list_prompts.append({"id": "default", "name": "default", "type": "public"}) + list_prompts.append({"id": "creative", "name": "creative", "type": "public"}) + list_prompts.append({"id": "strict", "name": "strict", "type": "public"}) + for prompt in prompts: + list_prompts.append({"id": str(prompt["_id"]), "name": prompt["name"], "type": "private"}) + + return jsonify(list_prompts) + +@user.route("/api/get_single_prompt", methods=["GET"]) +def get_single_prompt(): + prompt_id = request.args.get("id") + if prompt_id == 'default': + with open(os.path.join(current_dir, "prompts", "chat_combine_default.txt"), "r") as f: + chat_combine_template = f.read() + return jsonify({"content": chat_combine_template}) + elif prompt_id == 'creative': + with open(os.path.join(current_dir, "prompts", "chat_combine_creative.txt"), "r") as f: + chat_reduce_creative = f.read() + return jsonify({"content": chat_reduce_creative}) + elif prompt_id == 'strict': + with open(os.path.join(current_dir, "prompts", "chat_combine_strict.txt"), "r") as f: + chat_reduce_strict = f.read() + return jsonify({"content": chat_reduce_strict}) + + + prompt = prompts_collection.find_one({"_id": ObjectId(prompt_id)}) + return 
jsonify({"content": prompt["content"]}) + +@user.route("/api/delete_prompt", methods=["POST"]) +def delete_prompt(): + data = request.get_json() + id = data["id"] + prompts_collection.delete_one( + { + "_id": ObjectId(id), + } + ) + return {"status": "ok"} + +@user.route("/api/update_prompt", methods=["POST"]) +def update_prompt_name(): + data = request.get_json() + id = data["id"] + name = data["name"] + content = data["content"] + # check if name is null + if name == "": + return {"status": "error"} + prompts_collection.update_one({"_id": ObjectId(id)},{"$set":{"name":name, "content": content}}) + return {"status": "ok"} + + + + + diff --git a/api/user/tasks.py b/api/user/tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..a3474939ab203bc860295d748f45dda9c2436135 --- /dev/null +++ b/api/user/tasks.py @@ -0,0 +1,7 @@ +from application.worker import ingest_worker +from application.celery import celery + +@celery.task(bind=True) +def ingest(self, directory, formats, name_job, filename, user): + resp = ingest_worker(self, directory, formats, name_job, filename, user) + return resp diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..ae619974974f13e59cdb6301fbe67515316aef51 --- /dev/null +++ b/app.py @@ -0,0 +1,44 @@ +import platform +import dotenv +from application.celery import celery +from flask import Flask, request, redirect +from application.core.settings import settings +from application.api.user.routes import user +from application.api.answer.routes import answer +from application.api.internal.routes import internal + +if platform.system() == "Windows": + import pathlib + pathlib.PosixPath = pathlib.WindowsPath + +dotenv.load_dotenv() + +app = Flask(__name__) +app.register_blueprint(user) +app.register_blueprint(answer) +app.register_blueprint(internal) +app.config.update( + UPLOAD_FOLDER="inputs", + CELERY_BROKER_URL=settings.CELERY_BROKER_URL, + CELERY_RESULT_BACKEND=settings.CELERY_RESULT_BACKEND, + MONGO_URI=settings.MONGO_URI +) +celery.config_from_object("application.celeryconfig") + +@app.route("/") +def home(): + if request.remote_addr in ('0.0.0.0', '127.0.0.1', 'localhost', '172.18.0.1'): + return redirect('http://localhost:5173') + else: + return 'Welcome to DocsGPT Backend!' 
+ +@app.after_request +def after_request(response): + response.headers.add("Access-Control-Allow-Origin", "*") + response.headers.add("Access-Control-Allow-Headers", "Content-Type,Authorization") + response.headers.add("Access-Control-Allow-Methods", "GET,PUT,POST,DELETE,OPTIONS") + return response + +if __name__ == "__main__": + app.run(debug=True, port=7091) + diff --git a/celery.py b/celery.py new file mode 100644 index 0000000000000000000000000000000000000000..c19c2e75489300b0b95a70ebcc3162a07c1bd3f8 --- /dev/null +++ b/celery.py @@ -0,0 +1,9 @@ +from celery import Celery +from application.core.settings import settings + +def make_celery(app_name=__name__): + celery = Celery(app_name, broker=settings.CELERY_BROKER_URL, backend=settings.CELERY_RESULT_BACKEND) + celery.conf.update(settings) + return celery + +celery = make_celery() diff --git a/celeryconfig.py b/celeryconfig.py new file mode 100644 index 0000000000000000000000000000000000000000..712b3bfc6193b8015ec13c244601aea09abe39e6 --- /dev/null +++ b/celeryconfig.py @@ -0,0 +1,8 @@ +import os + +broker_url = os.getenv("CELERY_BROKER_URL") +result_backend = os.getenv("CELERY_RESULT_BACKEND") + +task_serializer = 'json' +result_serializer = 'json' +accept_content = ['json'] diff --git a/core/__init__.py b/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/core/__pycache__/__init__.cpython-310.pyc b/core/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ab79ff8589d8887c6293676b263e94ebff7bb94 Binary files /dev/null and b/core/__pycache__/__init__.cpython-310.pyc differ diff --git a/core/__pycache__/settings.cpython-310.pyc b/core/__pycache__/settings.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28afe965994ff14490537514d570e893a8d8f57c Binary files /dev/null and b/core/__pycache__/settings.cpython-310.pyc differ diff --git a/core/settings.py b/core/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..42dea0ff6adebb1a4787f4228f6c015b922eacff --- /dev/null +++ b/core/settings.py @@ -0,0 +1,44 @@ +from pathlib import Path +from typing import Optional +import os + +from pydantic_settings import BaseSettings +current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +class Settings(BaseSettings): + LLM_NAME: str = "docsgpt" + EMBEDDINGS_NAME: str = "huggingface_sentence-transformers/all-mpnet-base-v2" + CELERY_BROKER_URL: str = "redis://localhost:6379/0" + CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1" + MONGO_URI: str = "mongodb://localhost:27017/docsgpt" + MODEL_PATH: str = os.path.join(current_dir, "models/docsgpt-7b-f16.gguf") + TOKENS_MAX_HISTORY: int = 150 + UPLOAD_FOLDER: str = "inputs" + VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" + + API_URL: str = "http://localhost:7091" # backend url for celery worker + + API_KEY: Optional[str] = None # LLM api key + EMBEDDINGS_KEY: Optional[str] = None # api key for embeddings (if using openai, just copy API_KEY) + OPENAI_API_BASE: Optional[str] = None # azure openai api base url + OPENAI_API_VERSION: Optional[str] = None # azure openai api version + AZURE_DEPLOYMENT_NAME: Optional[str] = None # azure deployment name for answering + AZURE_EMBEDDINGS_DEPLOYMENT_NAME: Optional[str] = None # azure deployment name for embeddings + + # elasticsearch + ELASTIC_CLOUD_ID: Optional[str] = None # cloud id for elasticsearch + 
ELASTIC_USERNAME: Optional[str] = None # username for elasticsearch + ELASTIC_PASSWORD: Optional[str] = None # password for elasticsearch + ELASTIC_URL: Optional[str] = None # url for elasticsearch + ELASTIC_INDEX: Optional[str] = "docsgpt" # index name for elasticsearch + + # SageMaker config + SAGEMAKER_ENDPOINT: Optional[str] = None # SageMaker endpoint name + SAGEMAKER_REGION: Optional[str] = None # SageMaker region name + SAGEMAKER_ACCESS_KEY: Optional[str] = None # SageMaker access key + SAGEMAKER_SECRET_KEY: Optional[str] = None # SageMaker secret key + + +path = Path(__file__).parent.parent.absolute() +settings = Settings(_env_file=path.joinpath(".env"), _env_file_encoding="utf-8") diff --git a/error.py b/error.py new file mode 100644 index 0000000000000000000000000000000000000000..5d42f0ee412e5ec68b751fd8bf81c63bb9d3de3c --- /dev/null +++ b/error.py @@ -0,0 +1,15 @@ +from flask import jsonify +from werkzeug.http import HTTP_STATUS_CODES + + +def response_error(code_status, message=None): + payload = {'error': HTTP_STATUS_CODES.get(code_status, "something went wrong")} + if message: + payload['message'] = message + response = jsonify(payload) + response.status_code = code_status + return response + + +def bad_request(status_code=400, message=''): + return response_error(code_status=status_code, message=message) diff --git a/index.faiss b/index.faiss new file mode 100644 index 0000000000000000000000000000000000000000..4e3784e9cd1f2594e882a8db28e010110fb43386 Binary files /dev/null and b/index.faiss differ diff --git a/index.pkl b/index.pkl new file mode 100644 index 0000000000000000000000000000000000000000..304ee3181bbea0793355c78913eead0181d6f446 --- /dev/null +++ b/index.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1653826159295b5a262df5228ec9678a919a9fcc3ff94248eeaa55f434c071ef +size 7866 diff --git a/indexes/local/patil2016.pdf/index.faiss b/indexes/local/patil2016.pdf/index.faiss new file mode 100644 index 0000000000000000000000000000000000000000..6d4f04266c6c377f3b5d8a0de6a31c83f68b1a73 Binary files /dev/null and b/indexes/local/patil2016.pdf/index.faiss differ diff --git a/indexes/local/patil2016.pdf/index.pkl b/indexes/local/patil2016.pdf/index.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5a23a4ef29b5fb62ff9edfd25cbcb98c7e3da24b --- /dev/null +++ b/indexes/local/patil2016.pdf/index.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc1aa0edd32b66234b113edba42b67f5fc498851e584863124f44abf3920273 +size 28255 diff --git a/inputs/local/patil2016.pdf/patil2016.pdf b/inputs/local/patil2016.pdf/patil2016.pdf new file mode 100644 index 0000000000000000000000000000000000000000..fce6d78d67c3f1260ee1155e4b60f326bb26e5ea Binary files /dev/null and b/inputs/local/patil2016.pdf/patil2016.pdf differ diff --git a/llm/__init__.py b/llm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llm/__pycache__/__init__.cpython-310.pyc b/llm/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf95a2c895ccec491633590295ec5b840b4087d7 Binary files /dev/null and b/llm/__pycache__/__init__.cpython-310.pyc differ diff --git a/llm/__pycache__/anthropic.cpython-310.pyc b/llm/__pycache__/anthropic.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0d12bf1e79a812acf88cde898922184bb68cb81 Binary files /dev/null and b/llm/__pycache__/anthropic.cpython-310.pyc 
differ diff --git a/llm/__pycache__/base.cpython-310.pyc b/llm/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7c4d8a394a511ac201a96c81c01c22c0923309e Binary files /dev/null and b/llm/__pycache__/base.cpython-310.pyc differ diff --git a/llm/__pycache__/docsgpt_provider.cpython-310.pyc b/llm/__pycache__/docsgpt_provider.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..918460b87abc24a5c3fc25e6b48579126e7f6ded Binary files /dev/null and b/llm/__pycache__/docsgpt_provider.cpython-310.pyc differ diff --git a/llm/__pycache__/huggingface.cpython-310.pyc b/llm/__pycache__/huggingface.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9881b678dc75efe05e623badc946d44e8199517 Binary files /dev/null and b/llm/__pycache__/huggingface.cpython-310.pyc differ diff --git a/llm/__pycache__/llama_cpp.cpython-310.pyc b/llm/__pycache__/llama_cpp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..485858bdfd5bd3d62d1e06bb828d4a4a9bb6873d Binary files /dev/null and b/llm/__pycache__/llama_cpp.cpython-310.pyc differ diff --git a/llm/__pycache__/llm_creator.cpython-310.pyc b/llm/__pycache__/llm_creator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d75deaaf41cc477f53efa4d3eb39dee4cf3f58ca Binary files /dev/null and b/llm/__pycache__/llm_creator.cpython-310.pyc differ diff --git a/llm/__pycache__/openai.cpython-310.pyc b/llm/__pycache__/openai.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca423823f4794bcf61e2b15b975649328b457abe Binary files /dev/null and b/llm/__pycache__/openai.cpython-310.pyc differ diff --git a/llm/__pycache__/sagemaker.cpython-310.pyc b/llm/__pycache__/sagemaker.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7eb1403df23815af82fc85c1930b2ef20c94449 Binary files /dev/null and b/llm/__pycache__/sagemaker.cpython-310.pyc differ diff --git a/llm/anthropic.py b/llm/anthropic.py new file mode 100644 index 0000000000000000000000000000000000000000..a64d71e937fe5b7e25021222139588faab9e760a --- /dev/null +++ b/llm/anthropic.py @@ -0,0 +1,40 @@ +from application.llm.base import BaseLLM +from application.core.settings import settings + +class AnthropicLLM(BaseLLM): + + def __init__(self, api_key=None): + from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT + self.api_key = api_key or settings.ANTHROPIC_API_KEY # If not provided, use a default from settings + self.anthropic = Anthropic(api_key=self.api_key) + self.HUMAN_PROMPT = HUMAN_PROMPT + self.AI_PROMPT = AI_PROMPT + + def gen(self, model, messages, engine=None, max_tokens=300, stream=False, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Context \n {context} \n ### Question \n {user_question}" + if stream: + return self.gen_stream(model, prompt, max_tokens, **kwargs) + + completion = self.anthropic.completions.create( + model=model, + max_tokens_to_sample=max_tokens, + stream=stream, + prompt=f"{self.HUMAN_PROMPT} {prompt}{self.AI_PROMPT}", + ) + return completion.completion + + def gen_stream(self, model, messages, engine=None, max_tokens=300, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Context \n {context} \n ### Question \n {user_question}" + stream_response = self.anthropic.completions.create( + model=model, + prompt=f"{self.HUMAN_PROMPT} 
{prompt}{self.AI_PROMPT}", + max_tokens_to_sample=max_tokens, + stream=True, + ) + + for completion in stream_response: + yield completion.completion \ No newline at end of file diff --git a/llm/base.py b/llm/base.py new file mode 100644 index 0000000000000000000000000000000000000000..e08a3b090950d55d35a749a117ec29f4e00e5d4e --- /dev/null +++ b/llm/base.py @@ -0,0 +1,14 @@ +from abc import ABC, abstractmethod + + +class BaseLLM(ABC): + def __init__(self): + pass + + @abstractmethod + def gen(self, *args, **kwargs): + pass + + @abstractmethod + def gen_stream(self, *args, **kwargs): + pass diff --git a/llm/docsgpt_provider.py b/llm/docsgpt_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..b7d6a5ad570902c36809c5e5fedd0f390c376922 --- /dev/null +++ b/llm/docsgpt_provider.py @@ -0,0 +1,49 @@ +from application.llm.base import BaseLLM +import json +import requests + +class DocsGPTAPILLM(BaseLLM): + + def __init__(self, *args, **kwargs): + self.endpoint = "https://llm.docsgpt.co.uk" + + + def gen(self, model, engine, messages, stream=False, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + response = requests.post( + f"{self.endpoint}/answer", + json={ + "prompt": prompt, + "max_new_tokens": 30 + } + ) + response_clean = response.json()['a'].split("###")[0] + + return response_clean + + def gen_stream(self, model, engine, messages, stream=True, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + # send prompt to endpoint /stream + response = requests.post( + f"{self.endpoint}/stream", + json={ + "prompt": prompt, + "max_new_tokens": 256 + }, + stream=True + ) + + for line in response.iter_lines(): + if line: + #data = json.loads(line) + data_str = line.decode('utf-8') + if data_str.startswith("data: "): + data = json.loads(data_str[6:]) + yield data['a'] + \ No newline at end of file diff --git a/llm/huggingface.py b/llm/huggingface.py new file mode 100644 index 0000000000000000000000000000000000000000..ef3b1fbceeeb01e62c94077c477ead555a18eb06 --- /dev/null +++ b/llm/huggingface.py @@ -0,0 +1,44 @@ +from application.llm.base import BaseLLM + +class HuggingFaceLLM(BaseLLM): + + def __init__(self, api_key, llm_name='Arc53/DocsGPT-7B',q=False): + global hf + + from langchain.llms import HuggingFacePipeline + if q: + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig + tokenizer = AutoTokenizer.from_pretrained(llm_name) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16 + ) + model = AutoModelForCausalLM.from_pretrained(llm_name,quantization_config=bnb_config) + else: + from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline + tokenizer = AutoTokenizer.from_pretrained(llm_name) + model = AutoModelForCausalLM.from_pretrained(llm_name) + + pipe = pipeline( + "text-generation", model=model, + tokenizer=tokenizer, max_new_tokens=2000, + device_map="auto", eos_token_id=tokenizer.eos_token_id + ) + hf = HuggingFacePipeline(pipeline=pipe) + + def gen(self, model, engine, messages, stream=False, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### 
Context \n {context} \n ### Answer \n" + + result = hf(prompt) + + return result.content + + def gen_stream(self, model, engine, messages, stream=True, **kwargs): + + raise NotImplementedError("HuggingFaceLLM Streaming is not implemented yet.") + diff --git a/llm/llama_cpp.py b/llm/llama_cpp.py new file mode 100644 index 0000000000000000000000000000000000000000..f18d4379ee6dadc5a394f320a2e88c58994cd488 --- /dev/null +++ b/llm/llama_cpp.py @@ -0,0 +1,39 @@ +from application.llm.base import BaseLLM +from application.core.settings import settings + +class LlamaCpp(BaseLLM): + + def __init__(self, api_key, llm_name=settings.MODEL_PATH, **kwargs): + global llama + try: + from llama_cpp import Llama + except ImportError: + raise ImportError("Please install llama_cpp using pip install llama-cpp-python") + + llama = Llama(model_path=llm_name, n_ctx=2048) + + def gen(self, model, engine, messages, stream=False, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + result = llama(prompt, max_tokens=150, echo=False) + + # import sys + # print(result['choices'][0]['text'].split('### Answer \n')[-1], file=sys.stderr) + + return result['choices'][0]['text'].split('### Answer \n')[-1] + + def gen_stream(self, model, engine, messages, stream=True, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + result = llama(prompt, max_tokens=150, echo=False, stream=stream) + + # import sys + # print(list(result), file=sys.stderr) + + for item in result: + for choice in item['choices']: + yield choice['text'] diff --git a/llm/llm_creator.py b/llm/llm_creator.py new file mode 100644 index 0000000000000000000000000000000000000000..d0d6ae3f8d8d218627d66f043f574e386346fc3f --- /dev/null +++ b/llm/llm_creator.py @@ -0,0 +1,26 @@ +from application.llm.openai import OpenAILLM, AzureOpenAILLM +from application.llm.sagemaker import SagemakerAPILLM +from application.llm.huggingface import HuggingFaceLLM +from application.llm.llama_cpp import LlamaCpp +from application.llm.anthropic import AnthropicLLM +from application.llm.docsgpt_provider import DocsGPTAPILLM + + + +class LLMCreator: + llms = { + 'openai': OpenAILLM, + 'azure_openai': AzureOpenAILLM, + 'sagemaker': SagemakerAPILLM, + 'huggingface': HuggingFaceLLM, + 'llama.cpp': LlamaCpp, + 'anthropic': AnthropicLLM, + 'docsgpt': DocsGPTAPILLM + } + + @classmethod + def create_llm(cls, type, *args, **kwargs): + llm_class = cls.llms.get(type.lower()) + if not llm_class: + raise ValueError(f"No LLM class found for type {type}") + return llm_class(*args, **kwargs) \ No newline at end of file diff --git a/llm/openai.py b/llm/openai.py new file mode 100644 index 0000000000000000000000000000000000000000..a132399aacdf899d44ed8de55a37ac31ecc23a5e --- /dev/null +++ b/llm/openai.py @@ -0,0 +1,60 @@ +from application.llm.base import BaseLLM +from application.core.settings import settings + +class OpenAILLM(BaseLLM): + + def __init__(self, api_key): + global openai + from openai import OpenAI + + self.client = OpenAI( + api_key=api_key, + ) + self.api_key = api_key + + def _get_openai(self): + # Import openai when needed + import openai + + return openai + + def gen(self, model, engine, messages, stream=False, **kwargs): + response = self.client.chat.completions.create(model=model, + messages=messages, + stream=stream, + 
+                                                       **kwargs)
+
+        return response.choices[0].message.content
+
+    def gen_stream(self, model, engine, messages, stream=True, **kwargs):
+        response = self.client.chat.completions.create(model=model,
+                                                        messages=messages,
+                                                        stream=stream,
+                                                        **kwargs)
+
+        for line in response:
+            # import sys
+            # print(line.choices[0].delta.content, file=sys.stderr)
+            if line.choices[0].delta.content is not None:
+                yield line.choices[0].delta.content
+
+
+class AzureOpenAILLM(OpenAILLM):
+
+    def __init__(self, openai_api_key, openai_api_base, openai_api_version, deployment_name):
+        super().__init__(openai_api_key)
+        self.api_base = settings.OPENAI_API_BASE
+        self.api_version = settings.OPENAI_API_VERSION
+        self.deployment_name = settings.AZURE_DEPLOYMENT_NAME
+        from openai import AzureOpenAI
+        self.client = AzureOpenAI(
+            api_key=openai_api_key,
+            api_version=settings.OPENAI_API_VERSION,
+            azure_endpoint=settings.OPENAI_API_BASE,
+            azure_deployment=settings.AZURE_DEPLOYMENT_NAME,
+        )
+
+    def _get_openai(self):
+        openai = super()._get_openai()
+
+        return openai
diff --git a/llm/sagemaker.py b/llm/sagemaker.py
new file mode 100644
index 0000000000000000000000000000000000000000..84ae09adf99647c477f31268abafc3ffb6c16850
--- /dev/null
+++ b/llm/sagemaker.py
@@ -0,0 +1,139 @@
+from application.llm.base import BaseLLM
+from application.core.settings import settings
+import json
+import io
+
+
+class LineIterator:
+    """
+    A helper class for parsing the byte stream input.
+
+    The output of the model will be in the following format:
+    ```
+    b'{"outputs": [" a"]}\n'
+    b'{"outputs": [" challenging"]}\n'
+    b'{"outputs": [" problem"]}\n'
+    ...
+    ```
+
+    While usually each PayloadPart event from the event stream will contain a byte array
+    with a full json, this is not guaranteed and some of the json objects may be split across
+    PayloadPart events. For example:
+    ```
+    {'PayloadPart': {'Bytes': b'{"outputs": '}}
+    {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
+    ```
+
+    This class accounts for that by buffering the incoming bytes in an internal BytesIO
+    object and returning only complete lines (ending with a '\n' character) from __next__.
+    It keeps track of the last read position so that previously returned bytes are not
+    exposed again.
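+
+    A minimal usage sketch (mirroring how gen_stream below consumes the stream;
+    'event_stream' here stands for the 'Body' of an
+    invoke_endpoint_with_response_stream response):
+    ```
+    for line in LineIterator(event_stream):
+        print(line.decode('utf-8'))
+    ```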
+ """ + + def __init__(self, stream): + self.byte_iterator = iter(stream) + self.buffer = io.BytesIO() + self.read_pos = 0 + + def __iter__(self): + return self + + def __next__(self): + while True: + self.buffer.seek(self.read_pos) + line = self.buffer.readline() + if line and line[-1] == ord('\n'): + self.read_pos += len(line) + return line[:-1] + try: + chunk = next(self.byte_iterator) + except StopIteration: + if self.read_pos < self.buffer.getbuffer().nbytes: + continue + raise + if 'PayloadPart' not in chunk: + print('Unknown event type:' + chunk) + continue + self.buffer.seek(0, io.SEEK_END) + self.buffer.write(chunk['PayloadPart']['Bytes']) + +class SagemakerAPILLM(BaseLLM): + + def __init__(self, *args, **kwargs): + import boto3 + runtime = boto3.client( + 'runtime.sagemaker', + aws_access_key_id='xxx', + aws_secret_access_key='xxx', + region_name='us-west-2' + ) + + + self.endpoint = settings.SAGEMAKER_ENDPOINT + self.runtime = runtime + + + def gen(self, model, engine, messages, stream=False, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + + # Construct payload for endpoint + payload = { + "inputs": prompt, + "stream": False, + "parameters": { + "do_sample": True, + "temperature": 0.1, + "max_new_tokens": 30, + "repetition_penalty": 1.03, + "stop": ["", "###"] + } + } + body_bytes = json.dumps(payload).encode('utf-8') + + # Invoke the endpoint + response = self.runtime.invoke_endpoint(EndpointName=self.endpoint, + ContentType='application/json', + Body=body_bytes) + result = json.loads(response['Body'].read().decode()) + import sys + print(result[0]['generated_text'], file=sys.stderr) + return result[0]['generated_text'][len(prompt):] + + def gen_stream(self, model, engine, messages, stream=True, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + + # Construct payload for endpoint + payload = { + "inputs": prompt, + "stream": True, + "parameters": { + "do_sample": True, + "temperature": 0.1, + "max_new_tokens": 512, + "repetition_penalty": 1.03, + "stop": ["", "###"] + } + } + body_bytes = json.dumps(payload).encode('utf-8') + + # Invoke the endpoint + response = self.runtime.invoke_endpoint_with_response_stream(EndpointName=self.endpoint, + ContentType='application/json', + Body=body_bytes) + #result = json.loads(response['Body'].read().decode()) + event_stream = response['Body'] + start_json = b'{' + for line in LineIterator(event_stream): + if line != b'' and start_json in line: + #print(line) + data = json.loads(line[line.find(start_json):].decode('utf-8')) + if data['token']['text'] not in ["", "###"]: + print(data['token']['text'],end='') + yield data['token']['text'] \ No newline at end of file diff --git a/parser/__init__.py b/parser/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/parser/__init__.py @@ -0,0 +1 @@ + diff --git a/parser/__pycache__/__init__.cpython-310.pyc b/parser/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d7b8485471e114891e5b1e58b1a48421b72a1e6 Binary files /dev/null and b/parser/__pycache__/__init__.cpython-310.pyc differ diff --git a/parser/__pycache__/open_ai_func.cpython-310.pyc b/parser/__pycache__/open_ai_func.cpython-310.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..167143959e69e6b1324a0f16e9925adccda6d84b Binary files /dev/null and b/parser/__pycache__/open_ai_func.cpython-310.pyc differ diff --git a/parser/__pycache__/token_func.cpython-310.pyc b/parser/__pycache__/token_func.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b33a007c639722650171b4ef90f7cc449d2e930b Binary files /dev/null and b/parser/__pycache__/token_func.cpython-310.pyc differ diff --git a/parser/file/__init__.py b/parser/file/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/parser/file/__init__.py @@ -0,0 +1 @@ + diff --git a/parser/file/__pycache__/__init__.cpython-310.pyc b/parser/file/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3267074255ccff8679245c2768a70080a3699b8 Binary files /dev/null and b/parser/file/__pycache__/__init__.cpython-310.pyc differ diff --git a/parser/file/__pycache__/base.cpython-310.pyc b/parser/file/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d758673efacb4cf0ac063108bd5ba63cf42cfbee Binary files /dev/null and b/parser/file/__pycache__/base.cpython-310.pyc differ diff --git a/parser/file/__pycache__/base_parser.cpython-310.pyc b/parser/file/__pycache__/base_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c6cd9521184d12059d0b565cad01b365c9dabf4 Binary files /dev/null and b/parser/file/__pycache__/base_parser.cpython-310.pyc differ diff --git a/parser/file/__pycache__/bulk.cpython-310.pyc b/parser/file/__pycache__/bulk.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccee0317291b96f53ef271db926501010d57d59b Binary files /dev/null and b/parser/file/__pycache__/bulk.cpython-310.pyc differ diff --git a/parser/file/__pycache__/docs_parser.cpython-310.pyc b/parser/file/__pycache__/docs_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70684d8a015fecec8ab6ac76eda6a9b7a2526678 Binary files /dev/null and b/parser/file/__pycache__/docs_parser.cpython-310.pyc differ diff --git a/parser/file/__pycache__/epub_parser.cpython-310.pyc b/parser/file/__pycache__/epub_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28e629b1e75cd5bb18a0ce91e28b5aea0bb80116 Binary files /dev/null and b/parser/file/__pycache__/epub_parser.cpython-310.pyc differ diff --git a/parser/file/__pycache__/html_parser.cpython-310.pyc b/parser/file/__pycache__/html_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccb70083c45807d9e7b488b779fa1befdfce3217 Binary files /dev/null and b/parser/file/__pycache__/html_parser.cpython-310.pyc differ diff --git a/parser/file/__pycache__/markdown_parser.cpython-310.pyc b/parser/file/__pycache__/markdown_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ee52f8bb3b4d7f2f40a9689176adb35c459a169 Binary files /dev/null and b/parser/file/__pycache__/markdown_parser.cpython-310.pyc differ diff --git a/parser/file/__pycache__/rst_parser.cpython-310.pyc b/parser/file/__pycache__/rst_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b4bc7dd426c5fbfdc5255aa776d6476fdb96321 Binary files /dev/null and b/parser/file/__pycache__/rst_parser.cpython-310.pyc differ diff --git 
a/parser/file/__pycache__/tabular_parser.cpython-310.pyc b/parser/file/__pycache__/tabular_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f024193bfb916a3b6f725a3ecb997cf1e1567037 Binary files /dev/null and b/parser/file/__pycache__/tabular_parser.cpython-310.pyc differ diff --git a/parser/file/base.py b/parser/file/base.py new file mode 100644 index 0000000000000000000000000000000000000000..f63e8ef64d21208d886c4c6d52c1d6e097609e3e --- /dev/null +++ b/parser/file/base.py @@ -0,0 +1,19 @@ +"""Base reader class.""" +from abc import abstractmethod +from typing import Any, List + +from langchain.docstore.document import Document as LCDocument +from application.parser.schema.base import Document + + +class BaseReader: + """Utilities for loading data from a directory.""" + + @abstractmethod + def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]: + """Load data from the input directory.""" + + def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]: + """Load data in LangChain document format.""" + docs = self.load_data(**load_kwargs) + return [d.to_langchain_format() for d in docs] diff --git a/parser/file/base_parser.py b/parser/file/base_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..753a56f9797432f053fb96a72bdb782a2b20bd05 --- /dev/null +++ b/parser/file/base_parser.py @@ -0,0 +1,38 @@ +"""Base parser and config class.""" + +from abc import abstractmethod +from pathlib import Path +from typing import Dict, List, Optional, Union + + +class BaseParser: + """Base class for all parsers.""" + + def __init__(self, parser_config: Optional[Dict] = None): + """Init params.""" + self._parser_config = parser_config + + def init_parser(self) -> None: + """Init parser and store it.""" + parser_config = self._init_parser() + self._parser_config = parser_config + + @property + def parser_config_set(self) -> bool: + """Check if parser config is set.""" + return self._parser_config is not None + + @property + def parser_config(self) -> Dict: + """Check if parser config is set.""" + if self._parser_config is None: + raise ValueError("Parser config not set.") + return self._parser_config + + @abstractmethod + def _init_parser(self) -> Dict: + """Initialize the parser with the config.""" + + @abstractmethod + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + """Parse file.""" diff --git a/parser/file/bulk.py b/parser/file/bulk.py new file mode 100644 index 0000000000000000000000000000000000000000..593681e29b2bf109cfe5e5cd21476d71e1e44412 --- /dev/null +++ b/parser/file/bulk.py @@ -0,0 +1,163 @@ +"""Simple reader that reads files of different formats from a directory.""" +import logging +from pathlib import Path +from typing import Callable, Dict, List, Optional, Union + +from application.parser.file.base import BaseReader +from application.parser.file.base_parser import BaseParser +from application.parser.file.docs_parser import DocxParser, PDFParser +from application.parser.file.epub_parser import EpubParser +from application.parser.file.html_parser import HTMLParser +from application.parser.file.markdown_parser import MarkdownParser +from application.parser.file.rst_parser import RstParser +from application.parser.file.tabular_parser import PandasCSVParser +from application.parser.schema.base import Document + +DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { + ".pdf": PDFParser(), + ".docx": DocxParser(), + ".csv": PandasCSVParser(), + ".epub": EpubParser(), + 
".md": MarkdownParser(), + ".rst": RstParser(), + ".html": HTMLParser(), + ".mdx": MarkdownParser(), +} + + +class SimpleDirectoryReader(BaseReader): + """Simple directory reader. + + Can read files into separate documents, or concatenates + files into one document text. + + Args: + input_dir (str): Path to the directory. + input_files (List): List of file paths to read (Optional; overrides input_dir) + exclude_hidden (bool): Whether to exclude hidden files (dotfiles). + errors (str): how encoding and decoding errors are to be handled, + see https://docs.python.org/3/library/functions.html#open + recursive (bool): Whether to recursively search in subdirectories. + False by default. + required_exts (Optional[List[str]]): List of required extensions. + Default is None. + file_extractor (Optional[Dict[str, BaseParser]]): A mapping of file + extension to a BaseParser class that specifies how to convert that file + to text. See DEFAULT_FILE_EXTRACTOR. + num_files_limit (Optional[int]): Maximum number of files to read. + Default is None. + file_metadata (Optional[Callable[str, Dict]]): A function that takes + in a filename and returns a Dict of metadata for the Document. + Default is None. + """ + + def __init__( + self, + input_dir: Optional[str] = None, + input_files: Optional[List] = None, + exclude_hidden: bool = True, + errors: str = "ignore", + recursive: bool = True, + required_exts: Optional[List[str]] = None, + file_extractor: Optional[Dict[str, BaseParser]] = None, + num_files_limit: Optional[int] = None, + file_metadata: Optional[Callable[[str], Dict]] = None, + chunk_size_max: int = 2048, + ) -> None: + """Initialize with parameters.""" + super().__init__() + + if not input_dir and not input_files: + raise ValueError("Must provide either `input_dir` or `input_files`.") + + self.errors = errors + + self.recursive = recursive + self.exclude_hidden = exclude_hidden + self.required_exts = required_exts + self.num_files_limit = num_files_limit + + if input_files: + self.input_files = [] + for path in input_files: + print(path) + input_file = Path(path) + self.input_files.append(input_file) + elif input_dir: + self.input_dir = Path(input_dir) + self.input_files = self._add_files(self.input_dir) + + self.file_extractor = file_extractor or DEFAULT_FILE_EXTRACTOR + self.file_metadata = file_metadata + + def _add_files(self, input_dir: Path) -> List[Path]: + """Add files.""" + input_files = sorted(input_dir.iterdir()) + new_input_files = [] + dirs_to_explore = [] + for input_file in input_files: + if input_file.is_dir(): + if self.recursive: + dirs_to_explore.append(input_file) + elif self.exclude_hidden and input_file.name.startswith("."): + continue + elif ( + self.required_exts is not None + and input_file.suffix not in self.required_exts + ): + continue + else: + new_input_files.append(input_file) + + for dir_to_explore in dirs_to_explore: + sub_input_files = self._add_files(dir_to_explore) + new_input_files.extend(sub_input_files) + + if self.num_files_limit is not None and self.num_files_limit > 0: + new_input_files = new_input_files[0: self.num_files_limit] + + # print total number of files added + logging.debug( + f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}" + ) + + return new_input_files + + def load_data(self, concatenate: bool = False) -> List[Document]: + """Load data from the input directory. + + Args: + concatenate (bool): whether to concatenate all files into one document. + If set to True, file metadata is ignored. + False by default. 
+ + Returns: + List[Document]: A list of documents. + + """ + data: Union[str, List[str]] = "" + data_list: List[str] = [] + metadata_list = [] + for input_file in self.input_files: + if input_file.suffix in self.file_extractor: + parser = self.file_extractor[input_file.suffix] + if not parser.parser_config_set: + parser.init_parser() + data = parser.parse_file(input_file, errors=self.errors) + else: + # do standard read + with open(input_file, "r", errors=self.errors) as f: + data = f.read() + if isinstance(data, List): + data_list.extend(data) + else: + data_list.append(str(data)) + if self.file_metadata is not None: + metadata_list.append(self.file_metadata(str(input_file))) + + if concatenate: + return [Document("\n".join(data_list))] + elif self.file_metadata is not None: + return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)] + else: + return [Document(d) for d in data_list] diff --git a/parser/file/docs_parser.py b/parser/file/docs_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..861e8e5899935049f460799abbafd60c13164e34 --- /dev/null +++ b/parser/file/docs_parser.py @@ -0,0 +1,59 @@ +"""Docs parser. + +Contains parsers for docx, pdf files. + +""" +from pathlib import Path +from typing import Dict + +from application.parser.file.base_parser import BaseParser + + +class PDFParser(BaseParser): + """PDF parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> str: + """Parse file.""" + try: + import PyPDF2 + except ImportError: + raise ValueError("PyPDF2 is required to read PDF files.") + text_list = [] + with open(file, "rb") as fp: + # Create a PDF object + pdf = PyPDF2.PdfReader(fp) + + # Get the number of pages in the PDF document + num_pages = len(pdf.pages) + + # Iterate over every page + for page in range(num_pages): + # Extract the text from the page + page_text = pdf.pages[page].extract_text() + text_list.append(page_text) + text = "\n".join(text_list) + + return text + + +class DocxParser(BaseParser): + """Docx parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> str: + """Parse file.""" + try: + import docx2txt + except ImportError: + raise ValueError("docx2txt is required to read Microsoft Word files.") + + text = docx2txt.process(file) + + return text diff --git a/parser/file/epub_parser.py b/parser/file/epub_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..4f5e87115a082981328adcb838c0a361fd4733bf --- /dev/null +++ b/parser/file/epub_parser.py @@ -0,0 +1,43 @@ +"""Epub parser. + +Contains parsers for epub files. +""" + +from pathlib import Path +from typing import Dict + +from application.parser.file.base_parser import BaseParser + + +class EpubParser(BaseParser): + """Epub Parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> str: + """Parse file.""" + try: + import ebooklib + from ebooklib import epub + except ImportError: + raise ValueError("`EbookLib` is required to read Epub files.") + try: + import html2text + except ImportError: + raise ValueError("`html2text` is required to parse Epub files.") + + text_list = [] + book = epub.read_epub(file, options={"ignore_ncx": True}) + + # Iterate through all chapters. + for item in book.get_items(): + # Chapters are typically located in epub documents items. 
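+            # ITEM_DOCUMENT items hold the book's XHTML content documents; html2text
+            # renders each one as plain text before the pieces are joined with newlines below.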
+ if item.get_type() == ebooklib.ITEM_DOCUMENT: + text_list.append( + html2text.html2text(item.get_content().decode("utf-8")) + ) + + text = "\n".join(text_list) + return text diff --git a/parser/file/html_parser.py b/parser/file/html_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..2a74a97a53a67f9eeb26d621ef2ff690a1315c7b --- /dev/null +++ b/parser/file/html_parser.py @@ -0,0 +1,83 @@ +"""HTML parser. + +Contains parser for html files. + +""" +import re +from pathlib import Path +from typing import Dict, Union + +from application.parser.file.base_parser import BaseParser + + +class HTMLParser(BaseParser): + """HTML parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]: + """Parse file. + + Returns: + Union[str, List[str]]: a string or a List of strings. + """ + try: + from unstructured.partition.html import partition_html + from unstructured.staging.base import convert_to_isd + from unstructured.cleaners.core import clean + except ImportError: + raise ValueError("unstructured package is required to parse HTML files.") + + # Using the unstructured library to convert the html to isd format + # isd sample : isd = [ + # {"text": "My Title", "type": "Title"}, + # {"text": "My Narrative", "type": "NarrativeText"} + # ] + with open(file, "r", encoding="utf-8") as fp: + elements = partition_html(file=fp) + isd = convert_to_isd(elements) + + # Removing non ascii charactwers from isd_el['text'] + for isd_el in isd: + isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode() + + # Removing all the \n characters from isd_el['text'] using regex and replace with single space + # Removing all the extra spaces from isd_el['text'] using regex and replace with single space + for isd_el in isd: + isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL) + isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL) + + # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation + for isd_el in isd: + clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True) + + # Creating a list of all the indexes of isd_el['type'] = 'Title' + title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title'] + + # Creating 'Chunks' - List of lists of strings + # each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title' + # Each Chunk can be thought of as an individual set of data, which can be sent to the model + # Where Each Title is grouped together with the data under it + + Chunks = [[]] + final_chunks = list(list()) + + for i, isd_el in enumerate(isd): + if i in title_indexes: + Chunks.append([]) + Chunks[-1].append(isd_el['text']) + + # Removing all the chunks with sum of length of all the strings in the chunk < 25 + # TODO: This value can be an user defined variable + for chunk in Chunks: + # sum of length of all the strings in the chunk + sum = 0 + sum += len(str(chunk)) + if sum < 25: + Chunks.remove(chunk) + else: + # appending all the approved chunks to final_chunks as a single string + final_chunks.append(" ".join([str(item) for item in chunk])) + return final_chunks diff --git a/parser/file/markdown_parser.py b/parser/file/markdown_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..d906e9b6c1d8fe41386e39b8ac7cb20f358e8501 --- /dev/null +++ b/parser/file/markdown_parser.py @@ -0,0 
+1,145 @@ +"""Markdown parser. + +Contains parser for md files. + +""" +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +import tiktoken +from application.parser.file.base_parser import BaseParser + + +class MarkdownParser(BaseParser): + """Markdown parser. + + Extract text from markdown files. + Returns dictionary with keys as headers and values as the text between headers. + + """ + + def __init__( + self, + *args: Any, + remove_hyperlinks: bool = True, + remove_images: bool = True, + max_tokens: int = 2048, + # remove_tables: bool = True, + **kwargs: Any, + ) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._remove_hyperlinks = remove_hyperlinks + self._remove_images = remove_images + self._max_tokens = max_tokens + # self._remove_tables = remove_tables + + def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], + current_text: str): + """Append to tups chunk.""" + num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text)) + if num_tokens > self._max_tokens: + chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)] + for chunk in chunks: + tups.append((current_header, chunk)) + else: + tups.append((current_header, current_text)) + return tups + + def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]: + """Convert a markdown file to a dictionary. + + The keys are the headers and the values are the text under each header. + + """ + markdown_tups: List[Tuple[Optional[str], str]] = [] + lines = markdown_text.split("\n") + + current_header = None + current_text = "" + + for line in lines: + header_match = re.match(r"^#+\s", line) + if header_match: + if current_header is not None: + if current_text == "" or None: + continue + markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text) + + current_header = line + current_text = "" + else: + current_text += line + "\n" + markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text) + + if current_header is not None: + # pass linting, assert keys are defined + markdown_tups = [ + (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) + for key, value in markdown_tups + ] + else: + markdown_tups = [ + (key, re.sub("\n", "", value)) for key, value in markdown_tups + ] + + return markdown_tups + + def remove_images(self, content: str) -> str: + """Get a dictionary of a markdown file from its path.""" + pattern = r"!{1}\[\[(.*)\]\]" + content = re.sub(pattern, "", content) + return content + + # def remove_tables(self, content: str) -> List[List[str]]: + # """Convert markdown tables to nested lists.""" + # table_rows_pattern = r"((\r?\n){2}|^)([^\r\n]*\|[^\r\n]*(\r?\n)?)+(?=(\r?\n){2}|$)" + # table_cells_pattern = r"([^\|\r\n]*)\|" + # + # table_rows = re.findall(table_rows_pattern, content, re.MULTILINE) + # table_lists = [] + # for row in table_rows: + # cells = re.findall(table_cells_pattern, row[2]) + # cells = [cell.strip() for cell in cells if cell.strip()] + # table_lists.append(cells) + # return str(table_lists) + + def remove_hyperlinks(self, content: str) -> str: + """Get a dictionary of a markdown file from its path.""" + pattern = r"\[(.*?)\]\((.*?)\)" + content = re.sub(pattern, r"\1", content) + return content + + def _init_parser(self) -> Dict: + """Initialize the parser with the config.""" + return {} + + def parse_tups( + self, filepath: Path, errors: str = 
"ignore" + ) -> List[Tuple[Optional[str], str]]: + """Parse file into tuples.""" + with open(filepath, "r") as f: + content = f.read() + if self._remove_hyperlinks: + content = self.remove_hyperlinks(content) + if self._remove_images: + content = self.remove_images(content) + # if self._remove_tables: + # content = self.remove_tables(content) + markdown_tups = self.markdown_to_tups(content) + return markdown_tups + + def parse_file( + self, filepath: Path, errors: str = "ignore" + ) -> Union[str, List[str]]: + """Parse file into string.""" + tups = self.parse_tups(filepath, errors=errors) + results = [] + # TODO: don't include headers right now + for header, value in tups: + if header is None: + results.append(value) + else: + results.append(f"\n\n{header}\n{value}") + return results diff --git a/parser/file/openapi3_parser.py b/parser/file/openapi3_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..3c5082fa278e46eabaacbcf9cc62eb53b6e66162 --- /dev/null +++ b/parser/file/openapi3_parser.py @@ -0,0 +1,51 @@ +from urllib.parse import urlparse + +from openapi_parser import parse + +try: + from application.parser.file.base_parser import BaseParser +except ModuleNotFoundError: + from base_parser import BaseParser + + +class OpenAPI3Parser(BaseParser): + def init_parser(self) -> None: + return super().init_parser() + + def get_base_urls(self, urls): + base_urls = [] + for i in urls: + parsed_url = urlparse(i) + base_url = parsed_url.scheme + "://" + parsed_url.netloc + if base_url not in base_urls: + base_urls.append(base_url) + return base_urls + + def get_info_from_paths(self, path): + info = "" + if path.operations: + for operation in path.operations: + info += ( + f"\n{operation.method.value}=" + f"{operation.responses[0].description}" + ) + return info + + def parse_file(self, file_path): + data = parse(file_path) + results = "" + base_urls = self.get_base_urls(link.url for link in data.servers) + base_urls = ",".join([base_url for base_url in base_urls]) + results += f"Base URL:{base_urls}\n" + i = 1 + for path in data.paths: + info = self.get_info_from_paths(path) + results += ( + f"Path{i}: {path.url}\n" + f"description: {path.description}\n" + f"parameters: {path.parameters}\nmethods: {info}\n" + ) + i += 1 + with open("results.txt", "w") as f: + f.write(results) + return results diff --git a/parser/file/rst_parser.py b/parser/file/rst_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..633ec844559f1f8aac25822fde58f573ab053684 --- /dev/null +++ b/parser/file/rst_parser.py @@ -0,0 +1,173 @@ +"""reStructuredText parser. + +Contains parser for md files. + +""" +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +from application.parser.file.base_parser import BaseParser + + +class RstParser(BaseParser): + """reStructuredText parser. + + Extract text from .rst files. + Returns dictionary with keys as headers and values as the text between headers. 
+ + """ + + def __init__( + self, + *args: Any, + remove_hyperlinks: bool = True, + remove_images: bool = True, + remove_table_excess: bool = True, + remove_interpreters: bool = True, + remove_directives: bool = True, + remove_whitespaces_excess: bool = True, + # Be careful with remove_characters_excess, might cause data loss + remove_characters_excess: bool = True, + **kwargs: Any, + ) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._remove_hyperlinks = remove_hyperlinks + self._remove_images = remove_images + self._remove_table_excess = remove_table_excess + self._remove_interpreters = remove_interpreters + self._remove_directives = remove_directives + self._remove_whitespaces_excess = remove_whitespaces_excess + self._remove_characters_excess = remove_characters_excess + + def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]: + """Convert a reStructuredText file to a dictionary. + + The keys are the headers and the values are the text under each header. + + """ + rst_tups: List[Tuple[Optional[str], str]] = [] + lines = rst_text.split("\n") + + current_header = None + current_text = "" + + for i, line in enumerate(lines): + header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line) + if header_match and i > 0 and ( + len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]): + if current_header is not None: + if current_text == "" or None: + continue + # removes the next heading from current Document + if current_text.endswith(lines[i - 1] + "\n"): + current_text = current_text[:len(current_text) - len(lines[i - 1] + "\n")] + rst_tups.append((current_header, current_text)) + + current_header = lines[i - 1] + current_text = "" + else: + current_text += line + "\n" + + rst_tups.append((current_header, current_text)) + + # TODO: Format for rst + # + # if current_header is not None: + # # pass linting, assert keys are defined + # rst_tups = [ + # (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) + # for key, value in rst_tups + # ] + # else: + # rst_tups = [ + # (key, re.sub("\n", "", value)) for key, value in rst_tups + # ] + + if current_header is None: + rst_tups = [ + (key, re.sub("\n", "", value)) for key, value in rst_tups + ] + return rst_tups + + def remove_images(self, content: str) -> str: + pattern = r"\.\. image:: (.*)" + content = re.sub(pattern, "", content) + return content + + def remove_hyperlinks(self, content: str) -> str: + pattern = r"`(.*?) 
<(.*?)>`_" + content = re.sub(pattern, r"\1", content) + return content + + def remove_directives(self, content: str) -> str: + """Removes reStructuredText Directives""" + pattern = r"`\.\.([^:]+)::" + content = re.sub(pattern, "", content) + return content + + def remove_interpreters(self, content: str) -> str: + """Removes reStructuredText Interpreted Text Roles""" + pattern = r":(\w+):" + content = re.sub(pattern, "", content) + return content + + def remove_table_excess(self, content: str) -> str: + """Pattern to remove grid table separators""" + pattern = r"^\+[-]+\+[-]+\+$" + content = re.sub(pattern, "", content, flags=re.MULTILINE) + return content + + def remove_whitespaces_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]: + """Pattern to match 2 or more consecutive whitespaces""" + pattern = r"\s{2,}" + content = [(key, re.sub(pattern, " ", value)) for key, value in content] + return content + + def remove_characters_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]: + """Pattern to match 2 or more consecutive characters""" + pattern = r"(\S)\1{2,}" + content = [(key, re.sub(pattern, r"\1\1\1", value, flags=re.MULTILINE)) for key, value in content] + return content + + def _init_parser(self) -> Dict: + """Initialize the parser with the config.""" + return {} + + def parse_tups( + self, filepath: Path, errors: str = "ignore" + ) -> List[Tuple[Optional[str], str]]: + """Parse file into tuples.""" + with open(filepath, "r") as f: + content = f.read() + if self._remove_hyperlinks: + content = self.remove_hyperlinks(content) + if self._remove_images: + content = self.remove_images(content) + if self._remove_table_excess: + content = self.remove_table_excess(content) + if self._remove_directives: + content = self.remove_directives(content) + if self._remove_interpreters: + content = self.remove_interpreters(content) + rst_tups = self.rst_to_tups(content) + if self._remove_whitespaces_excess: + rst_tups = self.remove_whitespaces_excess(rst_tups) + if self._remove_characters_excess: + rst_tups = self.remove_characters_excess(rst_tups) + return rst_tups + + def parse_file( + self, filepath: Path, errors: str = "ignore" + ) -> Union[str, List[str]]: + """Parse file into string.""" + tups = self.parse_tups(filepath, errors=errors) + results = [] + # TODO: don't include headers right now + for header, value in tups: + if header is None: + results.append(value) + else: + results.append(f"\n\n{header}\n{value}") + return results diff --git a/parser/file/tabular_parser.py b/parser/file/tabular_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..81355ae079819412c33e0ba7723b7ab9632fe885 --- /dev/null +++ b/parser/file/tabular_parser.py @@ -0,0 +1,115 @@ +"""Tabular parser. + +Contains parsers for tabular data files. + +""" +from pathlib import Path +from typing import Any, Dict, List, Union + +from application.parser.file.base_parser import BaseParser + + +class CSVParser(BaseParser): + """CSV parser. + + Args: + concat_rows (bool): whether to concatenate all rows into one document. + If set to False, a Document will be created for each row. + True by default. + + """ + + def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._concat_rows = concat_rows + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + """Parse file. 
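+
+        Example (a sketch; the file name is hypothetical):
+
+            rows = CSVParser(concat_rows=False).parse_file(Path("table.csv"))
+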
+ + Returns: + Union[str, List[str]]: a string or a List of strings. + + """ + try: + import csv + except ImportError: + raise ValueError("csv module is required to read CSV files.") + text_list = [] + with open(file, "r") as fp: + csv_reader = csv.reader(fp) + for row in csv_reader: + text_list.append(", ".join(row)) + if self._concat_rows: + return "\n".join(text_list) + else: + return text_list + + +class PandasCSVParser(BaseParser): + r"""Pandas-based CSV parser. + + Parses CSVs using the separator detection from Pandas `read_csv`function. + If special parameters are required, use the `pandas_config` dict. + + Args: + concat_rows (bool): whether to concatenate all rows into one document. + If set to False, a Document will be created for each row. + True by default. + + col_joiner (str): Separator to use for joining cols per row. + Set to ", " by default. + + row_joiner (str): Separator to use for joining each row. + Only used when `concat_rows=True`. + Set to "\n" by default. + + pandas_config (dict): Options for the `pandas.read_csv` function call. + Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html + for more information. + Set to empty dict by default, this means pandas will try to figure + out the separators, table head, etc. on its own. + + """ + + def __init__( + self, + *args: Any, + concat_rows: bool = True, + col_joiner: str = ", ", + row_joiner: str = "\n", + pandas_config: dict = {}, + **kwargs: Any + ) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._concat_rows = concat_rows + self._col_joiner = col_joiner + self._row_joiner = row_joiner + self._pandas_config = pandas_config + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + """Parse file.""" + try: + import pandas as pd + except ImportError: + raise ValueError("pandas module is required to read CSV files.") + + df = pd.read_csv(file, **self._pandas_config) + + text_list = df.apply( + lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 + ).tolist() + + if self._concat_rows: + return (self._row_joiner).join(text_list) + else: + return text_list diff --git a/parser/java2doc.py b/parser/java2doc.py new file mode 100644 index 0000000000000000000000000000000000000000..2a8bfa3a42208560dc54953e3cfbece249d2b92b --- /dev/null +++ b/parser/java2doc.py @@ -0,0 +1,66 @@ +import os + +import javalang + + +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.java'): + files_list.append(os.path.join(root, file)) + return files_list + + +def extract_functions(file_path): + with open(file_path, "r") as file: + java_code = file.read() + methods = {} + tree = javalang.parse.parse(java_code) + for _, node in tree.filter(javalang.tree.MethodDeclaration): + method_name = node.name + start_line = node.position.line - 1 + end_line = start_line + brace_count = 0 + for line in java_code.splitlines()[start_line:]: + end_line += 1 + brace_count += line.count("{") - line.count("}") + if brace_count == 0: + break + method_source_code = "\n".join(java_code.splitlines()[start_line:end_line]) + methods[method_name] = method_source_code + return methods + + +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = javalang.parse.parse(source_code) + for class_decl in tree.types: + class_name = class_decl.name + declarations = [] + methods = 
[] + for field_decl in class_decl.fields: + field_name = field_decl.declarators[0].name + field_type = field_decl.type.name + declarations.append(f"{field_type} {field_name}") + for method_decl in class_decl.methods: + methods.append(method_decl.name) + class_string = "Declarations: " + ", ".join(declarations) + "\n Method name: " + ", ".join(methods) + classes[class_name] = class_string + return classes + + +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict diff --git a/parser/js2doc.py b/parser/js2doc.py new file mode 100644 index 0000000000000000000000000000000000000000..6dc448123afae253d19d6ce4e2ea5a0575121846 --- /dev/null +++ b/parser/js2doc.py @@ -0,0 +1,70 @@ +import os + +import escodegen +import esprima + + +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.js'): + files_list.append(os.path.join(root, file)) + return files_list + + +def extract_functions(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + functions = {} + tree = esprima.parseScript(source_code) + for node in tree.body: + if node.type == 'FunctionDeclaration': + func_name = node.id.name if node.id else '' + functions[func_name] = escodegen.generate(node) + elif node.type == 'VariableDeclaration': + for declaration in node.declarations: + if declaration.init and declaration.init.type == 'FunctionExpression': + func_name = declaration.id.name if declaration.id else '' + functions[func_name] = escodegen.generate(declaration.init) + elif node.type == 'ClassDeclaration': + for subnode in node.body.body: + if subnode.type == 'MethodDefinition': + func_name = subnode.key.name + functions[func_name] = escodegen.generate(subnode.value) + elif subnode.type == 'VariableDeclaration': + for declaration in subnode.declarations: + if declaration.init and declaration.init.type == 'FunctionExpression': + func_name = declaration.id.name if declaration.id else '' + functions[func_name] = escodegen.generate(declaration.init) + return functions + + +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = esprima.parseScript(source_code) + for node in tree.body: + if node.type == 'ClassDeclaration': + class_name = node.id.name + function_names = [] + for subnode in node.body.body: + if subnode.type == 'MethodDefinition': + function_names.append(subnode.key.name) + classes[class_name] = ", ".join(function_names) + return classes + + +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict diff --git a/parser/open_ai_func.py b/parser/open_ai_func.py new file mode 100644 index 0000000000000000000000000000000000000000..ede635a857fbf33237372fe0c520efec157d7d11 --- /dev/null +++ b/parser/open_ai_func.py @@ -0,0 +1,94 @@ +import os + +import tiktoken +from application.vectorstore.vector_creator import VectorCreator +from application.core.settings import settings +from retry import retry + + +# 
from langchain.embeddings import HuggingFaceEmbeddings +# from langchain.embeddings import HuggingFaceInstructEmbeddings +# from langchain.embeddings import CohereEmbeddings + + +def num_tokens_from_string(string: str, encoding_name: str) -> int: + # Function to convert string to tokens and estimate user cost. + encoding = tiktoken.get_encoding(encoding_name) + num_tokens = len(encoding.encode(string)) + total_price = ((num_tokens / 1000) * 0.0004) + return num_tokens, total_price + + +@retry(tries=10, delay=60) +def store_add_texts_with_retry(store, i): + store.add_texts([i.page_content], metadatas=[i.metadata]) + # store_pine.add_texts([i.page_content], metadatas=[i.metadata]) + + +def call_openai_api(docs, folder_name, task_status): + # Function to create a vector store from the documents and save it to disk. + + # create output folder if it doesn't exist + if not os.path.exists(f"{folder_name}"): + os.makedirs(f"{folder_name}") + + from tqdm import tqdm + c1 = 0 + if settings.VECTOR_STORE == "faiss": + docs_init = [docs[0]] + docs.pop(0) + + store = VectorCreator.create_vectorstore( + settings.VECTOR_STORE, + docs_init = docs_init, + path=f"{folder_name}", + embeddings_key=os.getenv("EMBEDDINGS_KEY") + ) + else: + store = VectorCreator.create_vectorstore( + settings.VECTOR_STORE, + path=f"{folder_name}", + embeddings_key=os.getenv("EMBEDDINGS_KEY") + ) + # Uncomment for MPNet embeddings + # model_name = "sentence-transformers/all-mpnet-base-v2" + # hf = HuggingFaceEmbeddings(model_name=model_name) + # store = FAISS.from_documents(docs_test, hf) + s1 = len(docs) + for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), + bar_format='{l_bar}{bar}| Time Left: {remaining}'): + try: + task_status.update_state(state='PROGRESS', meta={'current': int((c1 / s1) * 100)}) + store_add_texts_with_retry(store, i) + except Exception as e: + print(e) + print("Error on ", i) + print("Saving progress") + print(f"stopped at {c1} out of {len(docs)}") + store.save_local(f"{folder_name}") + break + c1 += 1 + if settings.VECTOR_STORE == "faiss": + store.save_local(f"{folder_name}") + + +def get_user_permission(docs, folder_name): + # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. + # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents. + # docs_content = (" ".join(docs)) + docs_content = "" + for doc in docs: + docs_content += doc.page_content + + tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base") + # Here we print the number of tokens and the approx user cost with some visually appealing formatting. + print(f"Number of Tokens = {format(tokens, ',d')}") + print(f"Approx Cost = ${format(total_price, ',.2f')}") + # Here we check for user permission before calling the API. + user_input = input("Price Okay? (Y/N) \n").lower() + if user_input == "y": + call_openai_api(docs, folder_name) + elif user_input == "": + call_openai_api(docs, folder_name) + else: + print("The API was not called. 
No money was spent.") diff --git a/parser/py2doc.py b/parser/py2doc.py new file mode 100644 index 0000000000000000000000000000000000000000..3a8175d4f84fa9c9a3f0859d14fba32446194878 --- /dev/null +++ b/parser/py2doc.py @@ -0,0 +1,121 @@ +import ast +import os +from pathlib import Path + +import tiktoken +from langchain.llms import OpenAI +from langchain.prompts import PromptTemplate + + +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.py'): + files_list.append(os.path.join(root, file)) + return files_list + + +def extract_functions(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + functions = {} + tree = ast.parse(source_code) + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef): + func_name = node.name + func_def = ast.get_source_segment(source_code, node) + functions[func_name] = func_def + return functions + + +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = ast.parse(source_code) + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + class_name = node.name + function_names = [] + for subnode in ast.walk(node): + if isinstance(subnode, ast.FunctionDef): + function_names.append(subnode.name) + classes[class_name] = ", ".join(function_names) + return classes + + +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict + + +def parse_functions(functions_dict, formats, dir): + c1 = len(functions_dict) + for i, (source, functions) in enumerate(functions_dict.items(), start=1): + print(f"Processing file {i}/{c1}") + source_w = source.replace(dir + "/", "").replace("." + formats, ".md") + subfolders = "/".join(source_w.split("/")[:-1]) + Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True) + for j, (name, function) in enumerate(functions.items(), start=1): + print(f"Processing function {j}/{len(functions)}") + prompt = PromptTemplate( + input_variables=["code"], + template="Code: \n{code}, \nDocumentation: ", + ) + llm = OpenAI(temperature=0) + response = llm(prompt.format(code=function)) + mode = "a" if Path(f"outputs/{source_w}").exists() else "w" + with open(f"outputs/{source_w}", mode) as f: + f.write( + f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}") + + +def parse_classes(classes_dict, formats, dir): + c1 = len(classes_dict) + for i, (source, classes) in enumerate(classes_dict.items()): + print(f"Processing file {i + 1}/{c1}") + source_w = source.replace(dir + "/", "").replace("." 
+ formats, ".md") + subfolders = "/".join(source_w.split("/")[:-1]) + Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True) + for name, function_names in classes.items(): + print(f"Processing Class {i + 1}/{c1}") + prompt = PromptTemplate( + input_variables=["class_name", "functions_names"], + template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ", + ) + llm = OpenAI(temperature=0) + response = llm(prompt.format(class_name=name, functions_names=function_names)) + + with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f: + f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}") + + +def transform_to_docs(functions_dict, classes_dict, formats, dir): + docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()]) + docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()]) + + num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(docs_content)) + total_price = ((num_tokens / 1000) * 0.02) + + print(f"Number of Tokens = {num_tokens:,d}") + print(f"Approx Cost = ${total_price:,.2f}") + + user_input = input("Price Okay? (Y/N)\n").lower() + if user_input == "y" or user_input == "": + if not Path("outputs").exists(): + Path("outputs").mkdir() + parse_functions(functions_dict, formats, dir) + parse_classes(classes_dict, formats, dir) + print("All done!") + else: + print("The API was not called. No money was spent.") diff --git a/parser/schema/__init__.py b/parser/schema/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/parser/schema/__init__.py @@ -0,0 +1 @@ + diff --git a/parser/schema/__pycache__/__init__.cpython-310.pyc b/parser/schema/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65128e835f4b729edc091ba37357de4b3696d768 Binary files /dev/null and b/parser/schema/__pycache__/__init__.cpython-310.pyc differ diff --git a/parser/schema/__pycache__/base.cpython-310.pyc b/parser/schema/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48f6c6deaa7a973b689477ad70579aee6f9b0189 Binary files /dev/null and b/parser/schema/__pycache__/base.cpython-310.pyc differ diff --git a/parser/schema/__pycache__/schema.cpython-310.pyc b/parser/schema/__pycache__/schema.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20c1685c5ac0ad4cb961337a53c844eb4a3fec56 Binary files /dev/null and b/parser/schema/__pycache__/schema.cpython-310.pyc differ diff --git a/parser/schema/base.py b/parser/schema/base.py new file mode 100644 index 0000000000000000000000000000000000000000..61670f9a6a28d18ca675ddb2402312e1ba6e031b --- /dev/null +++ b/parser/schema/base.py @@ -0,0 +1,34 @@ +"""Base schema for readers.""" +from dataclasses import dataclass + +from langchain.docstore.document import Document as LCDocument +from application.parser.schema.schema import BaseDocument + + +@dataclass +class Document(BaseDocument): + """Generic interface for a data document. + + This document connects to data sources. 
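+
+    Example (a minimal sketch; the text and metadata values are made up):
+
+        doc = Document(text="Hello world", extra_info={"title": "greeting.txt"})
+        lc_doc = doc.to_langchain_format()
+        doc_roundtrip = Document.from_langchain_format(lc_doc)
+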
+ + """ + + def __post_init__(self) -> None: + """Post init.""" + if self.text is None: + raise ValueError("text field not set.") + + @classmethod + def get_type(cls) -> str: + """Get Document type.""" + return "Document" + + def to_langchain_format(self) -> LCDocument: + """Convert struct to LangChain document format.""" + metadata = self.extra_info or {} + return LCDocument(page_content=self.text, metadata=metadata) + + @classmethod + def from_langchain_format(cls, doc: LCDocument) -> "Document": + """Convert struct from LangChain document format.""" + return cls(text=doc.page_content, extra_info=doc.metadata) diff --git a/parser/schema/schema.py b/parser/schema/schema.py new file mode 100644 index 0000000000000000000000000000000000000000..ec467e5a74f81e11b21a7965c94c1986708445d6 --- /dev/null +++ b/parser/schema/schema.py @@ -0,0 +1,64 @@ +"""Base schema for data structures.""" +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from dataclasses_json import DataClassJsonMixin + + +@dataclass +class BaseDocument(DataClassJsonMixin): + """Base document. + + Generic abstract interfaces that captures both index structs + as well as documents. + + """ + + # TODO: consolidate fields from Document/IndexStruct into base class + text: Optional[str] = None + doc_id: Optional[str] = None + embedding: Optional[List[float]] = None + + # extra fields + extra_info: Optional[Dict[str, Any]] = None + + @classmethod + @abstractmethod + def get_type(cls) -> str: + """Get Document type.""" + + def get_text(self) -> str: + """Get text.""" + if self.text is None: + raise ValueError("text field not set.") + return self.text + + def get_doc_id(self) -> str: + """Get doc_id.""" + if self.doc_id is None: + raise ValueError("doc_id not set.") + return self.doc_id + + @property + def is_doc_id_none(self) -> bool: + """Check if doc_id is None.""" + return self.doc_id is None + + def get_embedding(self) -> List[float]: + """Get embedding. + + Errors if embedding is None. 
+ + """ + if self.embedding is None: + raise ValueError("embedding not set.") + return self.embedding + + @property + def extra_info_str(self) -> Optional[str]: + """Extra info string.""" + if self.extra_info is None: + return None + + return "\n".join([f"{k}: {str(v)}" for k, v in self.extra_info.items()]) diff --git a/parser/token_func.py b/parser/token_func.py new file mode 100644 index 0000000000000000000000000000000000000000..14b231fcda46b78b6d04c7757e6aa17ce5285b8c --- /dev/null +++ b/parser/token_func.py @@ -0,0 +1,77 @@ +import re +from math import ceil +from typing import List + +import tiktoken +from application.parser.schema.base import Document + + +def separate_header_and_body(text): + header_pattern = r"^(.*?\n){3}" + match = re.match(header_pattern, text) + header = match.group(0) + body = text[len(header):] + return header, body + + +def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]: + docs = [] + current_group = None + + for doc in documents: + doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text)) + + if current_group is None: + current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding, + extra_info=doc.extra_info) + elif len(tiktoken.get_encoding("cl100k_base").encode( + current_group.text)) + doc_len < max_tokens and doc_len < min_tokens: + current_group.text += " " + doc.text + else: + docs.append(current_group) + current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding, + extra_info=doc.extra_info) + + if current_group is not None: + docs.append(current_group) + + return docs + + +def split_documents(documents: List[Document], max_tokens: int) -> List[Document]: + docs = [] + for doc in documents: + token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text)) + if token_length <= max_tokens: + docs.append(doc) + else: + header, body = separate_header_and_body(doc.text) + if len(tiktoken.get_encoding("cl100k_base").encode(header)) > max_tokens: + body = doc.text + header = "" + num_body_parts = ceil(token_length / max_tokens) + part_length = ceil(len(body) / num_body_parts) + body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)] + for i, body_part in enumerate(body_parts): + new_doc = Document(text=header + body_part.strip(), + doc_id=f"{doc.doc_id}-{i}", + embedding=doc.embedding, + extra_info=doc.extra_info) + docs.append(new_doc) + return docs + + +def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True): + if not token_check: + return documents + print("Grouping small documents") + try: + documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens) + except Exception: + print("Grouping failed, try running without token_check") + print("Separating large documents") + try: + documents = split_documents(documents=documents, max_tokens=max_tokens) + except Exception: + print("Grouping failed, try running without token_check") + return documents diff --git a/prompts/chat_combine_creative.txt b/prompts/chat_combine_creative.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f9a61c9da8020d960e52b6fdea4643e5964cff7 --- /dev/null +++ b/prompts/chat_combine_creative.txt @@ -0,0 +1,9 @@ +You are a DocsGPT, friendly and helpful AI assistant by Arc53 that provides help with documents. You give thorough answers with code examples if possible. +Use the following pieces of context to help answer the users question. 
If it's not relevant to the question, provide friendly responses.
+You have access to chat history, and can use it to help answer the question.
+When using code examples, use the following format:
+```(language)
+(code)
+```
+----------------
+{summaries}
\ No newline at end of file
diff --git a/prompts/chat_combine_default.txt b/prompts/chat_combine_default.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fdf0b2c25d7d59c9f7c99c2498c8b3cad375bd36
--- /dev/null
+++ b/prompts/chat_combine_default.txt
@@ -0,0 +1,9 @@
+You are a helpful AI assistant, DocsGPT, specializing in document assistance, designed to offer detailed and informative responses.
+If appropriate, your answers can include code examples, formatted as follows:
+```(language)
+(code)
+```
+You effectively utilize chat history, ensuring relevant and tailored responses.
+If a question doesn't align with your context, you provide friendly and helpful replies.
+----------------
+{summaries}
\ No newline at end of file
diff --git a/prompts/chat_combine_strict.txt b/prompts/chat_combine_strict.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b6de46303318a1ca31e20624596f38b40754254d
--- /dev/null
+++ b/prompts/chat_combine_strict.txt
@@ -0,0 +1,13 @@
+You are an AI Assistant, DocsGPT, adept at offering document assistance.
+Your expertise lies in providing answers on top of the provided context.
+You can leverage the chat history if needed.
+Answer the question based on the context below.
+Keep the answer concise. Respond "Irrelevant context" if you are not sure about the answer.
+If the question is not related to the context, respond "Irrelevant context".
+When using code examples, use the following format:
+```(language)
+(code)
+```
+ ----------------
+ Context:
+ {summaries}
\ No newline at end of file
diff --git a/prompts/chat_reduce_prompt.txt b/prompts/chat_reduce_prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a5842d878d56ea2c9ba892e473aff74263e1781b
--- /dev/null
+++ b/prompts/chat_reduce_prompt.txt
@@ -0,0 +1,3 @@
+Use the following pieces of context to help answer the user's question.
If its not relevant to the question, respond with "-" +---------------- +{context} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1bcb63331101e2bac426297bbb2ad0c829b20b2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,145 @@ +aiodns==3.1.1 +aiohttp==3.9.1 +aiohttp-retry==2.8.3 +aiosignal==1.3.1 +amqp==5.2.0 +annotated-types==0.6.0 +anthropic==0.8.0 +anyio==4.2.0 +async-timeout==4.0.3 +attrs==23.1.0 +billiard==4.2.0 +blinker==1.7.0 +blobfile==2.1.1 +boto3==1.34.6 +botocore==1.34.6 +celery==5.3.6 +certifi==2023.11.17 +cffi==1.16.0 +chardet==5.2.0 +charset-normalizer==3.3.2 +click==8.1.7 +click-didyoumean==0.3.0 +click-plugins==1.1.1 +click-repl==0.3.0 +cryptography==41.0.7 +dataclasses-json==0.6.3 +decorator==5.1.1 +dill==0.3.7 +distro==1.8.0 +dnspython==2.4.2 +docx2txt==0.8 +ecdsa==0.18.0 +elastic-transport==8.11.0 +elasticsearch==8.11.1 +entrypoints==0.4 +faiss-cpu==1.7.4 +filelock==3.13.1 +Flask==3.0.0 +Flask-Cors==4.0.0 +frozenlist==1.4.1 +fsspec==2023.12.2 +geojson==2.5.0 +greenlet==3.0.3 +gunicorn==21.2.0 +h11==0.14.0 +httpcore==1.0.2 +httpx==0.26.0 +huggingface-hub==0.20.1 +humbug==0.3.2 +idna==3.6 +iniconfig==2.0.0 +itsdangerous==2.1.2 +Jinja2==3.1.2 +jmespath==1.0.1 +joblib==1.3.2 +jsonpatch==1.33 +jsonpointer==2.4 +kombu==5.3.4 +langchain==0.0.352 +langchain-community==0.0.6 +langchain-core==0.1.3 +langsmith==0.0.74 +lazy-object-proxy==1.10.0 +loguru==0.7.2 +lxml==4.9.4 +MarkupSafe==2.1.3 +marshmallow==3.20.1 +marshmallow-enum==1.5.1 +mpmath==1.3.0 +multidict==6.0.4 +multiprocess==0.70.15 +mypy-extensions==1.0.0 +networkx==3.2.1 +nltk==3.8.1 +npx==0.1.1 +numcodecs==0.12.1 +numpy==1.26.2 +openai==1.6.1 +openapi-schema-validator==0.6.2 +openapi-spec-validator==0.6.0 +openapi3-parser==1.1.16 +packaging==23.2 +pathable==0.4.3 +pathos==0.3.1 +Pillow==10.1.0 +pluggy==1.3.0 +pox==0.3.3 +ppft==1.7.6.7 +prance==23.6.21.0 +prompt-toolkit==3.0.43 +pyasn1==0.5.1 +pycares==4.4.0 +pycparser==2.21 +pycryptodome==3.19.0 +pycryptodomex==3.19.0 +pydantic==2.5.3 +pydantic_core==2.14.6 +pydantic_settings==2.1.0 +PyJWT==2.8.0 +pymongo==4.6.1 +pyowm==3.3.0 +PyPDF2==3.0.1 +PySocks==1.7.1 +pytest==7.4.3 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +python-jose==3.3.0 +python-liquid==1.10.2 +pytz==2023.3.post1 +PyYAML==6.0.1 +redis==5.0.1 +regex==2023.10.3 +requests==2.31.0 +retry==0.9.2 +rfc3339-validator==0.1.4 +rpds-py==0.15.2 +rsa==4.9 +ruamel.yaml==0.18.5 +ruamel.yaml.clib==0.2.8 +s3transfer==0.10.0 +safetensors==0.4.1 +scikit-learn==1.3.2 +scipy==1.11.4 +sentence-transformers +sentencepiece==0.1.99 +six==1.16.0 +sniffio==1.3.0 +SQLAlchemy==2.0.23 +sympy==1.12 +tenacity==8.2.3 +threadpoolctl==3.2.0 +tiktoken +tokenizers==0.15.0 +torch==2.1.2 +torchvision==0.16.2 +tqdm==4.66.1 +transformers==4.36.2 +typer==0.9.0 +typing-inspect==0.9.0 +typing_extensions==4.9.0 +tzdata==2023.3 +vine==5.1.0 +wcwidth==0.2.12 +Werkzeug==3.0.1 +yarl==1.9.4 diff --git a/vectorstore/__init__.py b/vectorstore/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vectorstore/__pycache__/__init__.cpython-310.pyc b/vectorstore/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dccc9409174f1a476def628165a550a276eb032d Binary files /dev/null and b/vectorstore/__pycache__/__init__.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/base.cpython-310.pyc 
b/vectorstore/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e3eb4d7ea29c66903749c3a4880a2f431b171d6 Binary files /dev/null and b/vectorstore/__pycache__/base.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/document_class.cpython-310.pyc b/vectorstore/__pycache__/document_class.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbff6ead5eea51b8eb90e3766fda5813c36645cf Binary files /dev/null and b/vectorstore/__pycache__/document_class.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/elasticsearch.cpython-310.pyc b/vectorstore/__pycache__/elasticsearch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..212c6c87e6c3e09f72f5d93caefac1b5a359db79 Binary files /dev/null and b/vectorstore/__pycache__/elasticsearch.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/faiss.cpython-310.pyc b/vectorstore/__pycache__/faiss.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..089f4719c028edbaeba0266ff82a64c1bd938dd8 Binary files /dev/null and b/vectorstore/__pycache__/faiss.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/mongodb.cpython-310.pyc b/vectorstore/__pycache__/mongodb.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d81e69065cafeb121a7117ba434ed4efab0f115b Binary files /dev/null and b/vectorstore/__pycache__/mongodb.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/vector_creator.cpython-310.pyc b/vectorstore/__pycache__/vector_creator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30f15e71989fcd9fffc10ee848875dd4bad2e1bb Binary files /dev/null and b/vectorstore/__pycache__/vector_creator.cpython-310.pyc differ diff --git a/vectorstore/base.py b/vectorstore/base.py new file mode 100644 index 0000000000000000000000000000000000000000..ffff49b6a262417bac4b0ae58b121b07abe1cd65 --- /dev/null +++ b/vectorstore/base.py @@ -0,0 +1,56 @@ +from abc import ABC, abstractmethod +import os +from langchain.embeddings import ( + OpenAIEmbeddings, + HuggingFaceEmbeddings, + CohereEmbeddings, + HuggingFaceInstructEmbeddings, +) +from application.core.settings import settings + +class BaseVectorStore(ABC): + def __init__(self): + pass + + @abstractmethod + def search(self, *args, **kwargs): + pass + + def is_azure_configured(self): + return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME + + def _get_embeddings(self, embeddings_name, embeddings_key=None): + embeddings_factory = { + "openai_text-embedding-ada-002": OpenAIEmbeddings, + "huggingface_sentence-transformers/all-mpnet-base-v2": HuggingFaceEmbeddings, + "huggingface_hkunlp/instructor-large": HuggingFaceInstructEmbeddings, + "cohere_medium": CohereEmbeddings + } + + if embeddings_name not in embeddings_factory: + raise ValueError(f"Invalid embeddings_name: {embeddings_name}") + + if embeddings_name == "openai_text-embedding-ada-002": + if self.is_azure_configured(): + os.environ["OPENAI_API_TYPE"] = "azure" + embedding_instance = embeddings_factory[embeddings_name]( + model=settings.AZURE_EMBEDDINGS_DEPLOYMENT_NAME + ) + else: + embedding_instance = embeddings_factory[embeddings_name]( + openai_api_key=embeddings_key + ) + elif embeddings_name == "cohere_medium": + embedding_instance = embeddings_factory[embeddings_name]( + cohere_api_key=embeddings_key + ) + elif embeddings_name == 
"huggingface_sentence-transformers/all-mpnet-base-v2": + embedding_instance = embeddings_factory[embeddings_name]( + model_name="./model/all-mpnet-base-v2", + model_kwargs={"device": "cpu"}, + ) + else: + embedding_instance = embeddings_factory[embeddings_name]() + + return embedding_instance + diff --git a/vectorstore/document_class.py b/vectorstore/document_class.py new file mode 100644 index 0000000000000000000000000000000000000000..30d70a560e210ecf71c5bc84fba7f9d8733afe95 --- /dev/null +++ b/vectorstore/document_class.py @@ -0,0 +1,8 @@ +class Document(str): + """Class for storing a piece of text and associated metadata.""" + + def __new__(cls, page_content: str, metadata: dict): + instance = super().__new__(cls, page_content) + instance.page_content = page_content + instance.metadata = metadata + return instance diff --git a/vectorstore/elasticsearch.py b/vectorstore/elasticsearch.py new file mode 100644 index 0000000000000000000000000000000000000000..bb28d5cef8bae067a7c09c6a3aea0ca054dec7b7 --- /dev/null +++ b/vectorstore/elasticsearch.py @@ -0,0 +1,213 @@ +from application.vectorstore.base import BaseVectorStore +from application.core.settings import settings +from application.vectorstore.document_class import Document +import elasticsearch + + + + +class ElasticsearchStore(BaseVectorStore): + _es_connection = None # Class attribute to hold the Elasticsearch connection + + def __init__(self, path, embeddings_key, index_name=settings.ELASTIC_INDEX): + super().__init__() + self.path = path.replace("application/indexes/", "").rstrip("/") + self.embeddings_key = embeddings_key + self.index_name = index_name + + if ElasticsearchStore._es_connection is None: + connection_params = {} + if settings.ELASTIC_URL: + connection_params["hosts"] = [settings.ELASTIC_URL] + connection_params["http_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD) + elif settings.ELASTIC_CLOUD_ID: + connection_params["cloud_id"] = settings.ELASTIC_CLOUD_ID + connection_params["basic_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD) + else: + raise ValueError("Please provide either elasticsearch_url or cloud_id.") + + + + ElasticsearchStore._es_connection = elasticsearch.Elasticsearch(**connection_params) + + self.docsearch = ElasticsearchStore._es_connection + + def connect_to_elasticsearch( + *, + es_url = None, + cloud_id = None, + api_key = None, + username = None, + password = None, + ): + try: + import elasticsearch + except ImportError: + raise ImportError( + "Could not import elasticsearch python package. " + "Please install it with `pip install elasticsearch`." + ) + + if es_url and cloud_id: + raise ValueError( + "Both es_url and cloud_id are defined. Please provide only one." 
+ ) + + connection_params = {} + + if es_url: + connection_params["hosts"] = [es_url] + elif cloud_id: + connection_params["cloud_id"] = cloud_id + else: + raise ValueError("Please provide either elasticsearch_url or cloud_id.") + + if api_key: + connection_params["api_key"] = api_key + elif username and password: + connection_params["basic_auth"] = (username, password) + + es_client = elasticsearch.Elasticsearch( + **connection_params, + ) + try: + es_client.info() + except Exception as e: + raise e + + return es_client + + def search(self, question, k=2, index_name=settings.ELASTIC_INDEX, *args, **kwargs): + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key) + vector = embeddings.embed_query(question) + knn = { + "filter": [{"match": {"metadata.store.keyword": self.path}}], + "field": "vector", + "k": k, + "num_candidates": 100, + "query_vector": vector, + } + full_query = { + "knn": knn, + "query": { + "bool": { + "must": [ + { + "match": { + "text": { + "query": question, + } + } + } + ], + "filter": [{"match": {"metadata.store.keyword": self.path}}], + } + }, + "rank": {"rrf": {}}, + } + resp = self.docsearch.search(index=self.index_name, query=full_query['query'], size=k, knn=full_query['knn']) + # create Documents objects from the results page_content ['_source']['text'], metadata ['_source']['metadata'] + doc_list = [] + for hit in resp['hits']['hits']: + + doc_list.append(Document(page_content = hit['_source']['text'], metadata = hit['_source']['metadata'])) + return doc_list + + def _create_index_if_not_exists( + self, index_name, dims_length + ): + + if self._es_connection.indices.exists(index=index_name): + print(f"Index {index_name} already exists.") + + else: + + indexSettings = self.index( + dims_length=dims_length, + ) + self._es_connection.indices.create(index=index_name, **indexSettings) + + def index( + self, + dims_length, + ): + return { + "mappings": { + "properties": { + "vector": { + "type": "dense_vector", + "dims": dims_length, + "index": True, + "similarity": "cosine", + }, + } + } + } + + def add_texts( + self, + texts, + metadatas = None, + ids = None, + refresh_indices = True, + create_index_if_not_exists = True, + bulk_kwargs = None, + **kwargs, + ): + + from elasticsearch.helpers import BulkIndexError, bulk + + bulk_kwargs = bulk_kwargs or {} + import uuid + embeddings = [] + ids = ids or [str(uuid.uuid4()) for _ in texts] + requests = [] + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key) + + vectors = embeddings.embed_documents(list(texts)) + + dims_length = len(vectors[0]) + + if create_index_if_not_exists: + self._create_index_if_not_exists( + index_name=self.index_name, dims_length=dims_length + ) + + for i, (text, vector) in enumerate(zip(texts, vectors)): + metadata = metadatas[i] if metadatas else {} + + requests.append( + { + "_op_type": "index", + "_index": self.index_name, + "text": text, + "vector": vector, + "metadata": metadata, + "_id": ids[i], + } + ) + + + if len(requests) > 0: + try: + success, failed = bulk( + self._es_connection, + requests, + stats_only=True, + refresh=refresh_indices, + **bulk_kwargs, + ) + return ids + except BulkIndexError as e: + print(f"Error adding texts: {e}") + firstError = e.errors[0].get("index", {}).get("error", {}) + print(f"First error reason: {firstError.get('reason')}") + raise e + + else: + return [] + + def delete_index(self): + self._es_connection.delete_by_query(index=self.index_name, query={"match": { + "metadata.store.keyword": 
self.path}},) + diff --git a/vectorstore/faiss.py b/vectorstore/faiss.py new file mode 100644 index 0000000000000000000000000000000000000000..3a0a7b823bbc731dc7f3fa521811c641d335d578 --- /dev/null +++ b/vectorstore/faiss.py @@ -0,0 +1,46 @@ +from langchain.vectorstores import FAISS +from application.vectorstore.base import BaseVectorStore +from application.core.settings import settings + +class FaissStore(BaseVectorStore): + + def __init__(self, path, embeddings_key, docs_init=None): + super().__init__() + self.path = path + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) + if docs_init: + self.docsearch = FAISS.from_documents( + docs_init, embeddings + ) + else: + self.docsearch = FAISS.load_local( + self.path, embeddings + ) + self.assert_embedding_dimensions(embeddings) + + def search(self, *args, **kwargs): + return self.docsearch.similarity_search(*args, **kwargs) + + def add_texts(self, *args, **kwargs): + return self.docsearch.add_texts(*args, **kwargs) + + def save_local(self, *args, **kwargs): + return self.docsearch.save_local(*args, **kwargs) + + def delete_index(self, *args, **kwargs): + return self.docsearch.delete(*args, **kwargs) + + def assert_embedding_dimensions(self, embeddings): + """ + Check that the word embedding dimension of the docsearch index matches + the dimension of the word embeddings used + """ + if settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2": + try: + word_embedding_dimension = embeddings.client[1].word_embedding_dimension + except AttributeError as e: + raise AttributeError("word_embedding_dimension not found in embeddings.client[1]") from e + docsearch_index_dimension = self.docsearch.index.d + if word_embedding_dimension != docsearch_index_dimension: + raise ValueError(f"word_embedding_dimension ({word_embedding_dimension}) " + + f"!= docsearch_index_word_embedding_dimension ({docsearch_index_dimension})") \ No newline at end of file diff --git a/vectorstore/mongodb.py b/vectorstore/mongodb.py new file mode 100644 index 0000000000000000000000000000000000000000..337fc41fe3c79eb59409046095cb96fe177216a7 --- /dev/null +++ b/vectorstore/mongodb.py @@ -0,0 +1,126 @@ +from application.vectorstore.base import BaseVectorStore +from application.core.settings import settings +from application.vectorstore.document_class import Document + +class MongoDBVectorStore(BaseVectorStore): + def __init__( + self, + path: str = "", + embeddings_key: str = "embeddings", + collection: str = "documents", + index_name: str = "vector_search_index", + text_key: str = "text", + embedding_key: str = "embedding", + database: str = "docsgpt", + ): + self._index_name = index_name + self._text_key = text_key + self._embedding_key = embedding_key + self._embeddings_key = embeddings_key + self._mongo_uri = settings.MONGO_URI + self._path = path.replace("application/indexes/", "").rstrip("/") + self._embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) + + try: + import pymongo + except ImportError: + raise ImportError( + "Could not import pymongo python package. " + "Please install it with `pip install pymongo`." 
+ ) + + self._client = pymongo.MongoClient(self._mongo_uri) + self._database = self._client[database] + self._collection = self._database[collection] + + + def search(self, question, k=2, *args, **kwargs): + query_vector = self._embedding.embed_query(question) + + pipeline = [ + { + "$vectorSearch": { + "queryVector": query_vector, + "path": self._embedding_key, + "limit": k, + "numCandidates": k * 10, + "index": self._index_name, + "filter": { + "store": {"$eq": self._path} + } + } + } + ] + + cursor = self._collection.aggregate(pipeline) + + results = [] + for doc in cursor: + text = doc[self._text_key] + doc.pop("_id") + doc.pop(self._text_key) + doc.pop(self._embedding_key) + metadata = doc + results.append(Document(text, metadata)) + return results + + def _insert_texts(self, texts, metadatas): + if not texts: + return [] + embeddings = self._embedding.embed_documents(texts) + to_insert = [ + {self._text_key: t, self._embedding_key: embedding, **m} + for t, m, embedding in zip(texts, metadatas, embeddings) + ] + # insert the documents in MongoDB Atlas + insert_result = self._collection.insert_many(to_insert) + return insert_result.inserted_ids + + def add_texts(self, + texts, + metadatas = None, + ids = None, + refresh_indices = True, + create_index_if_not_exists = True, + bulk_kwargs = None, + **kwargs,): + + + #dims = self._embedding.client[1].word_embedding_dimension + # # check if index exists + # if create_index_if_not_exists: + # # check if index exists + # info = self._collection.index_information() + # if self._index_name not in info: + # index_mongo = { + # "fields": [{ + # "type": "vector", + # "path": self._embedding_key, + # "numDimensions": dims, + # "similarity": "cosine", + # }, + # { + # "type": "filter", + # "path": "store" + # }] + # } + # self._collection.create_index(self._index_name, index_mongo) + + batch_size = 100 + _metadatas = metadatas or ({} for _ in texts) + texts_batch = [] + metadatas_batch = [] + result_ids = [] + for i, (text, metadata) in enumerate(zip(texts, _metadatas)): + texts_batch.append(text) + metadatas_batch.append(metadata) + if (i + 1) % batch_size == 0: + result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) + texts_batch = [] + metadatas_batch = [] + if texts_batch: + result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) + return result_ids + + def delete_index(self, *args, **kwargs): + self._collection.delete_many({"store": self._path}) \ No newline at end of file diff --git a/vectorstore/vector_creator.py b/vectorstore/vector_creator.py new file mode 100644 index 0000000000000000000000000000000000000000..68ae281364e55adc5c9a394c608b4c028c4ce92b --- /dev/null +++ b/vectorstore/vector_creator.py @@ -0,0 +1,18 @@ +from application.vectorstore.faiss import FaissStore +from application.vectorstore.elasticsearch import ElasticsearchStore +from application.vectorstore.mongodb import MongoDBVectorStore + + +class VectorCreator: + vectorstores = { + 'faiss': FaissStore, + 'elasticsearch':ElasticsearchStore, + 'mongodb': MongoDBVectorStore, + } + + @classmethod + def create_vectorstore(cls, type, *args, **kwargs): + vectorstore_class = cls.vectorstores.get(type.lower()) + if not vectorstore_class: + raise ValueError(f"No vectorstore class found for type {type}") + return vectorstore_class(*args, **kwargs) \ No newline at end of file diff --git a/worker.py b/worker.py new file mode 100644 index 0000000000000000000000000000000000000000..ae8f240c452cf3ba18c83c2340ab022ca02e781f --- /dev/null +++ b/worker.py @@ -0,0 +1,123 
@@ +import os +import shutil +import string +import zipfile +from urllib.parse import urljoin + +import nltk +import requests + +from application.core.settings import settings +from application.parser.file.bulk import SimpleDirectoryReader +from application.parser.open_ai_func import call_openai_api +from application.parser.schema.base import Document +from application.parser.token_func import group_split + +try: + nltk.download('punkt', quiet=True) + nltk.download('averaged_perceptron_tagger', quiet=True) +except FileExistsError: + pass + + +# Define a function to extract metadata from a given filename. +def metadata_from_filename(title): + store = '/'.join(title.split('/')[1:3]) + return {'title': title, 'store': store} + + +# Define a function to generate a random string of a given length. +def generate_random_string(length): + return ''.join([string.ascii_letters[i % 52] for i in range(length)]) + +current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Define the main function for ingesting and processing documents. +def ingest_worker(self, directory, formats, name_job, filename, user): + """ + Ingest and process documents. + + Args: + self: Reference to the instance of the task. + directory (str): Specifies the directory for ingesting ('inputs' or 'temp'). + formats (list of str): List of file extensions to consider for ingestion (e.g., [".rst", ".md"]). + name_job (str): Name of the job for this ingestion task. + filename (str): Name of the file to be ingested. + user (str): Identifier for the user initiating the ingestion. + + Returns: + dict: Information about the completed ingestion task, including input parameters and a "limited" flag. + """ + # directory = 'inputs' or 'temp' + # formats = [".rst", ".md"] + input_files = None + recursive = True + limit = None + exclude = True + # name_job = 'job1' + # filename = 'install.rst' + # user = 'local' + sample = False + token_check = True + min_tokens = 150 + max_tokens = 1250 + full_path = directory + '/' + user + '/' + name_job + import sys + print(full_path, file=sys.stderr) + # check if API_URL env variable is set + file_data = {'name': name_job, 'file': filename, 'user': user} + response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data) + # check if file is in the response + print(response, file=sys.stderr) + file = response.content + + if not os.path.exists(full_path): + os.makedirs(full_path) + with open(full_path + '/' + filename, 'wb') as f: + f.write(file) + + # check if file is .zip and extract it + if filename.endswith('.zip'): + with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref: + zip_ref.extractall(full_path) + os.remove(full_path + '/' + filename) + + self.update_state(state='PROGRESS', meta={'current': 1}) + + raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive, + required_exts=formats, num_files_limit=limit, + exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data() + raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) + + docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + + call_openai_api(docs, full_path, self) + self.update_state(state='PROGRESS', meta={'current': 100}) + + if sample: + for i in range(min(5, len(raw_docs))): + print(raw_docs[i].text) + + # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl + # and send them to the server (provide user and name in form) + 
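# For FAISS, the freshly built index files are uploaded to the API; for other vector stores only the job metadata is posted. +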
file_data = {'name': name_job, 'user': user} + if settings.VECTOR_STORE == "faiss": + # use a context manager so the index file handles are closed once the upload finishes + with open(full_path + '/index.faiss', 'rb') as file_faiss, open(full_path + '/index.pkl', 'rb') as file_pkl: + files = {'file_faiss': file_faiss, 'file_pkl': file_pkl} + response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data) + response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)) + else: + response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data) + + + # delete local + shutil.rmtree(full_path) + + return { + 'directory': directory, + 'formats': formats, + 'name_job': name_job, + 'filename': filename, + 'user': user, + 'limited': False + } diff --git a/wsgi.py b/wsgi.py new file mode 100644 index 0000000000000000000000000000000000000000..5160e115ecbf009f5762714715b2cc1ce8c865d4 --- /dev/null +++ b/wsgi.py @@ -0,0 +1,4 @@ +from application.app import app + +if __name__ == "__main__": + app.run(debug=True, port=7091)
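
As a quick orientation for reviewers, here is a minimal sketch of how the VectorCreator factory and the search API added in this diff could be exercised. It is illustrative only: the index path and the query below are made-up placeholders, settings.VECTOR_STORE is assumed to hold one of the registered backend names, and the application wires these calls through its Flask routes rather than a standalone script like this.

from application.core.settings import settings
from application.vectorstore.vector_creator import VectorCreator

# Hypothetical values; real callers derive the path from the user's active
# documents and pass the configured embeddings key.
store = VectorCreator.create_vectorstore(
    settings.VECTOR_STORE,                 # "faiss", "elasticsearch", or "mongodb"
    "application/indexes/local/default",   # placeholder index path
    settings.EMBEDDINGS_KEY,
)

docs = store.search("How do I install the project?", k=2)
for doc in docs:
    # every backend returns objects exposing page_content and a metadata dict
    print(doc.metadata.get("title"), doc.page_content[:80])

Keeping the backends behind one factory means switching between FAISS, Elasticsearch and MongoDB Atlas is a settings change rather than a code change in the route handlers.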