diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..81ed570aded5304a64bc9f91ea7e60e3758bccdf --- /dev/null +++ b/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.11-slim-bullseye as builder + +# Tiktoken requires Rust toolchain, so build it in a separate stage +RUN apt-get update && apt-get install -y gcc curl +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && apt-get install --reinstall libc6-dev -y +ENV PATH="/root/.cargo/bin:${PATH}" +RUN pip install --upgrade pip && pip install tiktoken==0.5.2 +COPY requirements.txt . +RUN pip install -r requirements.txt +RUN apt-get install -y wget unzip +RUN wget https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.zip +RUN unzip mpnet-base-v2.zip -d model +RUN rm mpnet-base-v2.zip + +FROM python:3.11-slim-bullseye + +# Copy pre-built packages and binaries from builder stage +COPY --from=builder /usr/local/ /usr/local/ + +WORKDIR /app +COPY --from=builder /model /app/model + +COPY . /app/application +ENV FLASK_APP=app.py +ENV FLASK_DEBUG=true + +EXPOSE 7091 + +CMD ["gunicorn", "-w", "2", "--timeout", "120", "--bind", "0.0.0.0:7091", "application.wsgi:app"] diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/api/__pycache__/__init__.cpython-310.pyc b/api/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82dd1448e4ab7a588e98145c9ada38adf8341c28 Binary files /dev/null and b/api/__pycache__/__init__.cpython-310.pyc differ diff --git a/api/answer/__init__.py b/api/answer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/api/answer/__pycache__/__init__.cpython-310.pyc b/api/answer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c70fdb2ad7bb4caf0568659d439d7ffbb64ea4c Binary files /dev/null and b/api/answer/__pycache__/__init__.cpython-310.pyc differ diff --git a/api/answer/__pycache__/routes.cpython-310.pyc b/api/answer/__pycache__/routes.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a13a7ea2c0c6756344941ae445611f9376c6693 Binary files /dev/null and b/api/answer/__pycache__/routes.cpython-310.pyc differ diff --git a/api/answer/routes.py b/api/answer/routes.py new file mode 100644 index 0000000000000000000000000000000000000000..8c661e0bdad49ff1a7e84580214bfed1e19435c3 --- /dev/null +++ b/api/answer/routes.py @@ -0,0 +1,371 @@ +import asyncio +import os +from flask import Blueprint, request, Response +import json +import datetime +import logging +import traceback + +from pymongo import MongoClient +from bson.objectid import ObjectId +from transformers import GPT2TokenizerFast + + + +from application.core.settings import settings +from application.vectorstore.vector_creator import VectorCreator +from application.llm.llm_creator import LLMCreator +from application.error import bad_request + + + +logger = logging.getLogger(__name__) + +mongo = MongoClient(settings.MONGO_URI) +db = mongo["docsgpt"] +conversations_collection = db["conversations"] +vectors_collection = db["vectors"] +prompts_collection = db["prompts"] +answer = Blueprint('answer', __name__) + +if settings.LLM_NAME == "gpt4": + 
gpt_model = 'gpt-4' +elif settings.LLM_NAME == "anthropic": + gpt_model = 'claude-2' +else: + gpt_model = 'gpt-3.5-turbo' + +# load the prompts +current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +with open(os.path.join(current_dir, "prompts", "chat_combine_default.txt"), "r") as f: + chat_combine_template = f.read() + +with open(os.path.join(current_dir, "prompts", "chat_reduce_prompt.txt"), "r") as f: + chat_reduce_template = f.read() + +with open(os.path.join(current_dir, "prompts", "chat_combine_creative.txt"), "r") as f: + chat_combine_creative = f.read() + +with open(os.path.join(current_dir, "prompts", "chat_combine_strict.txt"), "r") as f: + chat_combine_strict = f.read() + +api_key_set = settings.API_KEY is not None +embeddings_key_set = settings.EMBEDDINGS_KEY is not None + + +async def async_generate(chain, question, chat_history): + result = await chain.arun({"question": question, "chat_history": chat_history}) + return result + + +def count_tokens(string): + tokenizer = GPT2TokenizerFast.from_pretrained('gpt2') + return len(tokenizer(string)['input_ids']) + + +def run_async_chain(chain, question, chat_history): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + result = {} + try: + answer = loop.run_until_complete(async_generate(chain, question, chat_history)) + finally: + loop.close() + result["answer"] = answer + return result + + +def get_vectorstore(data): + if "active_docs" in data: + if data["active_docs"].split("/")[0] == "default": + vectorstore = "" + elif data["active_docs"].split("/")[0] == "local": + vectorstore = "indexes/" + data["active_docs"] + else: + vectorstore = "vectors/" + data["active_docs"] + if data["active_docs"] == "default": + vectorstore = "" + else: + vectorstore = "" + vectorstore = os.path.join("application", vectorstore) + return vectorstore + + +def is_azure_configured(): + return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME + + +def complete_stream(question, docsearch, chat_history, api_key, prompt_id, conversation_id): + llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=api_key) + + if prompt_id == 'default': + prompt = chat_combine_template + elif prompt_id == 'creative': + prompt = chat_combine_creative + elif prompt_id == 'strict': + prompt = chat_combine_strict + else: + prompt = prompts_collection.find_one({"_id": ObjectId(prompt_id)})["content"] + + docs = docsearch.search(question, k=2) + if settings.LLM_NAME == "llama.cpp": + docs = [docs[0]] + # join all page_content together with a newline + docs_together = "\n".join([doc.page_content for doc in docs]) + p_chat_combine = prompt.replace("{summaries}", docs_together) + messages_combine = [{"role": "system", "content": p_chat_combine}] + source_log_docs = [] + for doc in docs: + if doc.metadata: + source_log_docs.append({"title": doc.metadata['title'].split('/')[-1], "text": doc.page_content}) + else: + source_log_docs.append({"title": doc.page_content, "text": doc.page_content}) + + if len(chat_history) > 1: + tokens_current_history = 0 + # count tokens in history + chat_history.reverse() + for i in chat_history: + if "prompt" in i and "response" in i: + tokens_batch = count_tokens(i["prompt"]) + count_tokens(i["response"]) + if tokens_current_history + tokens_batch < settings.TOKENS_MAX_HISTORY: + tokens_current_history += tokens_batch + messages_combine.append({"role": "user", "content": i["prompt"]}) + messages_combine.append({"role": "system", "content": i["response"]}) + 
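+    # the current question is appended last, after the system prompt and whatever
+    # history fit under settings.TOKENS_MAX_HISTORY, so it is always the final user turn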
messages_combine.append({"role": "user", "content": question}) + + response_full = "" + completion = llm.gen_stream(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME, + messages=messages_combine) + for line in completion: + data = json.dumps({"answer": str(line)}) + response_full += str(line) + yield f"data: {data}\n\n" + + # save conversation to database + if conversation_id is not None: + conversations_collection.update_one( + {"_id": ObjectId(conversation_id)}, + {"$push": {"queries": {"prompt": question, "response": response_full, "sources": source_log_docs}}}, + ) + + else: + # create new conversation + # generate summary + messages_summary = [{"role": "assistant", "content": "Summarise following conversation in no more than 3 " + "words, respond ONLY with the summary, use the same " + "language as the system \n\nUser: " + question + "\n\n" + + "AI: " + + response_full}, + {"role": "user", "content": "Summarise following conversation in no more than 3 words, " + "respond ONLY with the summary, use the same language as the " + "system"}] + + completion = llm.gen(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME, + messages=messages_summary, max_tokens=30) + conversation_id = conversations_collection.insert_one( + {"user": "local", + "date": datetime.datetime.utcnow(), + "name": completion, + "queries": [{"prompt": question, "response": response_full, "sources": source_log_docs}]} + ).inserted_id + + # send data.type = "end" to indicate that the stream has ended as json + data = json.dumps({"type": "id", "id": str(conversation_id)}) + yield f"data: {data}\n\n" + data = json.dumps({"type": "end"}) + yield f"data: {data}\n\n" + + +@answer.route("/stream", methods=["POST"]) +def stream(): + data = request.get_json() + # get parameter from url question + question = data["question"] + history = data["history"] + # history to json object from string + history = json.loads(history) + conversation_id = data["conversation_id"] + if 'prompt_id' in data: + prompt_id = data["prompt_id"] + else: + prompt_id = 'default' + + # check if active_docs is set + + if not api_key_set: + api_key = data["api_key"] + else: + api_key = settings.API_KEY + if not embeddings_key_set: + embeddings_key = data["embeddings_key"] + else: + embeddings_key = settings.EMBEDDINGS_KEY + if "active_docs" in data: + vectorstore = get_vectorstore({"active_docs": data["active_docs"]}) + else: + vectorstore = "" + docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key) + + return Response( + complete_stream(question, docsearch, + chat_history=history, api_key=api_key, + prompt_id=prompt_id, + conversation_id=conversation_id), mimetype="text/event-stream" + ) + + +@answer.route("/api/answer", methods=["POST"]) +def api_answer(): + data = request.get_json() + question = data["question"] + history = data["history"] + if "conversation_id" not in data: + conversation_id = None + else: + conversation_id = data["conversation_id"] + print("-" * 5) + if not api_key_set: + api_key = data["api_key"] + else: + api_key = settings.API_KEY + if not embeddings_key_set: + embeddings_key = data["embeddings_key"] + else: + embeddings_key = settings.EMBEDDINGS_KEY + if 'prompt_id' in data: + prompt_id = data["prompt_id"] + else: + prompt_id = 'default' + + if prompt_id == 'default': + prompt = chat_combine_template + elif prompt_id == 'creative': + prompt = chat_combine_creative + elif prompt_id == 'strict': + prompt = chat_combine_strict + else: + prompt = prompts_collection.find_one({"_id": 
ObjectId(prompt_id)})["content"] + + # use try and except to check for exception + try: + # check if the vectorstore is set + vectorstore = get_vectorstore(data) + # loading the index and the store and the prompt template + # Note if you have used other embeddings than OpenAI, you need to change the embeddings + docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key) + + + llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=api_key) + + + + docs = docsearch.search(question, k=2) + # join all page_content together with a newline + docs_together = "\n".join([doc.page_content for doc in docs]) + p_chat_combine = prompt.replace("{summaries}", docs_together) + messages_combine = [{"role": "system", "content": p_chat_combine}] + source_log_docs = [] + for doc in docs: + if doc.metadata: + source_log_docs.append({"title": doc.metadata['title'].split('/')[-1], "text": doc.page_content}) + else: + source_log_docs.append({"title": doc.page_content, "text": doc.page_content}) + # join all page_content together with a newline + + + if len(history) > 1: + tokens_current_history = 0 + # count tokens in history + history.reverse() + for i in history: + if "prompt" in i and "response" in i: + tokens_batch = count_tokens(i["prompt"]) + count_tokens(i["response"]) + if tokens_current_history + tokens_batch < settings.TOKENS_MAX_HISTORY: + tokens_current_history += tokens_batch + messages_combine.append({"role": "user", "content": i["prompt"]}) + messages_combine.append({"role": "system", "content": i["response"]}) + messages_combine.append({"role": "user", "content": question}) + + + completion = llm.gen(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME, + messages=messages_combine) + + + result = {"answer": completion, "sources": source_log_docs} + logger.debug(result) + + # generate conversationId + if conversation_id is not None: + conversations_collection.update_one( + {"_id": ObjectId(conversation_id)}, + {"$push": {"queries": {"prompt": question, + "response": result["answer"], "sources": result['sources']}}}, + ) + + else: + # create new conversation + # generate summary + messages_summary = [ + {"role": "assistant", "content": "Summarise following conversation in no more than 3 words, " + "respond ONLY with the summary, use the same language as the system \n\n" + "User: " + question + "\n\n" + "AI: " + result["answer"]}, + {"role": "user", "content": "Summarise following conversation in no more than 3 words, " + "respond ONLY with the summary, use the same language as the system"} + ] + + completion = llm.gen( + model=gpt_model, + engine=settings.AZURE_DEPLOYMENT_NAME, + messages=messages_summary, + max_tokens=30 + ) + conversation_id = conversations_collection.insert_one( + {"user": "local", + "date": datetime.datetime.utcnow(), + "name": completion, + "queries": [{"prompt": question, "response": result["answer"], "sources": source_log_docs}]} + ).inserted_id + + result["conversation_id"] = str(conversation_id) + + # mock result + # result = { + # "answer": "The answer is 42", + # "sources": ["https://en.wikipedia.org/wiki/42_(number)", "https://en.wikipedia.org/wiki/42_(number)"] + # } + return result + except Exception as e: + # print whole traceback + traceback.print_exc() + print(str(e)) + return bad_request(500, str(e)) + + +@answer.route("/api/search", methods=["POST"]) +def api_search(): + data = request.get_json() + # get parameter from url question + question = data["question"] + + if not embeddings_key_set: + embeddings_key = 
data["embeddings_key"] + else: + embeddings_key = settings.EMBEDDINGS_KEY + if "active_docs" in data: + vectorstore = get_vectorstore({"active_docs": data["active_docs"]}) + else: + vectorstore = "" + docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key) + + docs = docsearch.search(question, k=2) + + source_log_docs = [] + for doc in docs: + if doc.metadata: + source_log_docs.append({"title": doc.metadata['title'].split('/')[-1], "text": doc.page_content}) + else: + source_log_docs.append({"title": doc.page_content, "text": doc.page_content}) + #yield f"data:{data}\n\n" + return source_log_docs + diff --git a/api/internal/__init__.py b/api/internal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/api/internal/__pycache__/__init__.cpython-310.pyc b/api/internal/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b12b34e3892e0c1b90eb2ab655589176c16a41c3 Binary files /dev/null and b/api/internal/__pycache__/__init__.cpython-310.pyc differ diff --git a/api/internal/__pycache__/routes.cpython-310.pyc b/api/internal/__pycache__/routes.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7f4e03e1c61968bbc4d3b4d28e9730bc5cc707e Binary files /dev/null and b/api/internal/__pycache__/routes.cpython-310.pyc differ diff --git a/api/internal/routes.py b/api/internal/routes.py new file mode 100644 index 0000000000000000000000000000000000000000..e8a1b80b151601de3b8a70392449f75f0a203c55 --- /dev/null +++ b/api/internal/routes.py @@ -0,0 +1,69 @@ +import os +import datetime +from flask import Blueprint, request, send_from_directory +from pymongo import MongoClient +from werkzeug.utils import secure_filename + + +from application.core.settings import settings +mongo = MongoClient(settings.MONGO_URI) +db = mongo["docsgpt"] +conversations_collection = db["conversations"] +vectors_collection = db["vectors"] + +current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +internal = Blueprint('internal', __name__) +@internal.route("/api/download", methods=["get"]) +def download_file(): + user = secure_filename(request.args.get("user")) + job_name = secure_filename(request.args.get("name")) + filename = secure_filename(request.args.get("file")) + save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name) + return send_from_directory(save_dir, filename, as_attachment=True) + + + +@internal.route("/api/upload_index", methods=["POST"]) +def upload_index_files(): + """Upload two files(index.faiss, index.pkl) to the user's folder.""" + if "user" not in request.form: + return {"status": "no user"} + user = secure_filename(request.form["user"]) + if "name" not in request.form: + return {"status": "no name"} + job_name = secure_filename(request.form["name"]) + save_dir = os.path.join(current_dir, "indexes", user, job_name) + if settings.VECTOR_STORE == "faiss": + if "file_faiss" not in request.files: + print("No file part") + return {"status": "no file"} + file_faiss = request.files["file_faiss"] + if file_faiss.filename == "": + return {"status": "no file name"} + if "file_pkl" not in request.files: + print("No file part") + return {"status": "no file"} + file_pkl = request.files["file_pkl"] + if file_pkl.filename == "": + return {"status": "no file name"} + # saves index files + + if not os.path.exists(save_dir): + os.makedirs(save_dir) + 
file_faiss.save(os.path.join(save_dir, "index.faiss")) + file_pkl.save(os.path.join(save_dir, "index.pkl")) + # create entry in vectors_collection + vectors_collection.insert_one( + { + "user": user, + "name": job_name, + "language": job_name, + "location": save_dir, + "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), + "model": settings.EMBEDDINGS_NAME, + "type": "local", + } + ) + return {"status": "ok"} \ No newline at end of file diff --git a/api/user/__init__.py b/api/user/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/api/user/__pycache__/__init__.cpython-310.pyc b/api/user/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba17d39fe5a603209181f8748c5694fab37ccebf Binary files /dev/null and b/api/user/__pycache__/__init__.cpython-310.pyc differ diff --git a/api/user/__pycache__/routes.cpython-310.pyc b/api/user/__pycache__/routes.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d181c06e65243409e490bc8ef3e80d1fe8a6ea4 Binary files /dev/null and b/api/user/__pycache__/routes.cpython-310.pyc differ diff --git a/api/user/__pycache__/tasks.cpython-310.pyc b/api/user/__pycache__/tasks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..550c3aea49a511d99a305537ad9f4c416a1b7812 Binary files /dev/null and b/api/user/__pycache__/tasks.cpython-310.pyc differ diff --git a/api/user/routes.py b/api/user/routes.py new file mode 100644 index 0000000000000000000000000000000000000000..867425721026c2642e3ed5d201f68f192b6add6e --- /dev/null +++ b/api/user/routes.py @@ -0,0 +1,321 @@ +import os +from flask import Blueprint, request, jsonify +import requests +from pymongo import MongoClient +from bson.objectid import ObjectId +from werkzeug.utils import secure_filename + +from application.api.user.tasks import ingest + +from application.core.settings import settings +from application.vectorstore.vector_creator import VectorCreator + +mongo = MongoClient(settings.MONGO_URI) +db = mongo["docsgpt"] +conversations_collection = db["conversations"] +vectors_collection = db["vectors"] +prompts_collection = db["prompts"] +feedback_collection = db["feedback"] +user = Blueprint('user', __name__) + +current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +@user.route("/api/delete_conversation", methods=["POST"]) +def delete_conversation(): + # deletes a conversation from the database + conversation_id = request.args.get("id") + # write to mongodb + conversations_collection.delete_one( + { + "_id": ObjectId(conversation_id), + } + ) + + return {"status": "ok"} + +@user.route("/api/get_conversations", methods=["get"]) +def get_conversations(): + # provides a list of conversations + conversations = conversations_collection.find().sort("date", -1) + list_conversations = [] + for conversation in conversations: + list_conversations.append({"id": str(conversation["_id"]), "name": conversation["name"]}) + + #list_conversations = [{"id": "default", "name": "default"}, {"id": "jeff", "name": "jeff"}] + + return jsonify(list_conversations) + + +@user.route("/api/get_single_conversation", methods=["get"]) +def get_single_conversation(): + # provides data for a conversation + conversation_id = request.args.get("id") + conversation = conversations_collection.find_one({"_id": ObjectId(conversation_id)}) + return jsonify(conversation['queries']) + 
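+# a minimal request sketch for the route below (values are illustrative):
+#   POST /api/update_conversation_name
+#   {"id": "<conversation ObjectId as a string>", "name": "New conversation title"}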
+@user.route("/api/update_conversation_name", methods=["POST"]) +def update_conversation_name(): + # update data for a conversation + data = request.get_json() + id = data["id"] + name = data["name"] + conversations_collection.update_one({"_id": ObjectId(id)},{"$set":{"name":name}}) + return {"status": "ok"} + + +@user.route("/api/feedback", methods=["POST"]) +def api_feedback(): + data = request.get_json() + question = data["question"] + answer = data["answer"] + feedback = data["feedback"] + + + feedback_collection.insert_one( + { + "question": question, + "answer": answer, + "feedback": feedback, + } + ) + return {"status": "ok"} + +@user.route("/api/delete_by_ids", methods=["get"]) +def delete_by_ids(): + """Delete by ID. These are the IDs in the vectorstore""" + + ids = request.args.get("path") + if not ids: + return {"status": "error"} + + if settings.VECTOR_STORE == "faiss": + result = vectors_collection.delete_index(ids=ids) + if result: + return {"status": "ok"} + return {"status": "error"} + +@user.route("/api/delete_old", methods=["get"]) +def delete_old(): + """Delete old indexes.""" + import shutil + + path = request.args.get("path") + dirs = path.split("/") + dirs_clean = [] + for i in range(0, len(dirs)): + dirs_clean.append(secure_filename(dirs[i])) + # check that path strats with indexes or vectors + + if dirs_clean[0] not in ["indexes", "vectors"]: + return {"status": "error"} + path_clean = "/".join(dirs_clean) + vectors_collection.delete_one({"name": dirs_clean[-1], 'user': dirs_clean[-2]}) + if settings.VECTOR_STORE == "faiss": + try: + shutil.rmtree(os.path.join(current_dir, path_clean)) + except FileNotFoundError: + pass + else: + vetorstore = VectorCreator.create_vectorstore( + settings.VECTOR_STORE, path=os.path.join(current_dir, path_clean) + ) + vetorstore.delete_index() + + return {"status": "ok"} + +@user.route("/api/upload", methods=["POST"]) +def upload_file(): + """Upload a file to get vectorized and indexed.""" + if "user" not in request.form: + return {"status": "no user"} + user = secure_filename(request.form["user"]) + if "name" not in request.form: + return {"status": "no name"} + job_name = secure_filename(request.form["name"]) + # check if the post request has the file part + if "file" not in request.files: + print("No file part") + return {"status": "no file"} + file = request.files["file"] + if file.filename == "": + return {"status": "no file name"} + + if file: + filename = secure_filename(file.filename) + # save dir + save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name) + # create dir if not exists + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + file.save(os.path.join(save_dir, filename)) + task = ingest.delay(settings.UPLOAD_FOLDER, [".rst", ".md", ".pdf", ".txt", ".docx", + ".csv", ".epub", ".html", ".mdx"], + job_name, filename, user) + # task id + task_id = task.id + return {"status": "ok", "task_id": task_id} + else: + return {"status": "error"} + +@user.route("/api/task_status", methods=["GET"]) +def task_status(): + """Get celery job status.""" + task_id = request.args.get("task_id") + from application.celery import celery + task = celery.AsyncResult(task_id) + task_meta = task.info + return {"status": task.status, "result": task_meta} + + +@user.route("/api/combine", methods=["GET"]) +def combined_json(): + user = "local" + """Provide json file with combined available indexes.""" + # get json from https://d3dg1063dc54p9.cloudfront.net/combined.json + + data = [ + { + "name": "default", + "language": 
"default", + "version": "", + "description": "default", + "fullName": "default", + "date": "default", + "docLink": "default", + "model": settings.EMBEDDINGS_NAME, + "location": "remote", + } + ] + # structure: name, language, version, description, fullName, date, docLink + # append data from vectors_collection + for index in vectors_collection.find({"user": user}): + data.append( + { + "name": index["name"], + "language": index["language"], + "version": "", + "description": index["name"], + "fullName": index["name"], + "date": index["date"], + "docLink": index["location"], + "model": settings.EMBEDDINGS_NAME, + "location": "local", + } + ) + if settings.VECTOR_STORE == "faiss": + data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json() + for index in data_remote: + index["location"] = "remote" + data.append(index) + + return jsonify(data) + + +@user.route("/api/docs_check", methods=["POST"]) +def check_docs(): + # check if docs exist in a vectorstore folder + data = request.get_json() + # split docs on / and take first part + if data["docs"].split("/")[0] == "local": + return {"status": "exists"} + vectorstore = "vectors/" + data["docs"] + base_path = "https://raw.githubusercontent.com/arc53/DocsHUB/main/" + if os.path.exists(vectorstore) or data["docs"] == "default": + return {"status": "exists"} + else: + r = requests.get(base_path + vectorstore + "index.faiss") + + if r.status_code != 200: + return {"status": "null"} + else: + if not os.path.exists(vectorstore): + os.makedirs(vectorstore) + with open(vectorstore + "index.faiss", "wb") as f: + f.write(r.content) + + # download the store + r = requests.get(base_path + vectorstore + "index.pkl") + with open(vectorstore + "index.pkl", "wb") as f: + f.write(r.content) + + return {"status": "loaded"} + +@user.route("/api/create_prompt", methods=["POST"]) +def create_prompt(): + data = request.get_json() + content = data["content"] + name = data["name"] + if name == "": + return {"status": "error"} + user = "local" + resp = prompts_collection.insert_one( + { + "name": name, + "content": content, + "user": user, + } + ) + new_id = str(resp.inserted_id) + return {"id": new_id} + +@user.route("/api/get_prompts", methods=["GET"]) +def get_prompts(): + user = "local" + prompts = prompts_collection.find({"user": user}) + list_prompts = [] + list_prompts.append({"id": "default", "name": "default", "type": "public"}) + list_prompts.append({"id": "creative", "name": "creative", "type": "public"}) + list_prompts.append({"id": "strict", "name": "strict", "type": "public"}) + for prompt in prompts: + list_prompts.append({"id": str(prompt["_id"]), "name": prompt["name"], "type": "private"}) + + return jsonify(list_prompts) + +@user.route("/api/get_single_prompt", methods=["GET"]) +def get_single_prompt(): + prompt_id = request.args.get("id") + if prompt_id == 'default': + with open(os.path.join(current_dir, "prompts", "chat_combine_default.txt"), "r") as f: + chat_combine_template = f.read() + return jsonify({"content": chat_combine_template}) + elif prompt_id == 'creative': + with open(os.path.join(current_dir, "prompts", "chat_combine_creative.txt"), "r") as f: + chat_reduce_creative = f.read() + return jsonify({"content": chat_reduce_creative}) + elif prompt_id == 'strict': + with open(os.path.join(current_dir, "prompts", "chat_combine_strict.txt"), "r") as f: + chat_reduce_strict = f.read() + return jsonify({"content": chat_reduce_strict}) + + + prompt = prompts_collection.find_one({"_id": ObjectId(prompt_id)}) + return 
jsonify({"content": prompt["content"]}) + +@user.route("/api/delete_prompt", methods=["POST"]) +def delete_prompt(): + data = request.get_json() + id = data["id"] + prompts_collection.delete_one( + { + "_id": ObjectId(id), + } + ) + return {"status": "ok"} + +@user.route("/api/update_prompt", methods=["POST"]) +def update_prompt_name(): + data = request.get_json() + id = data["id"] + name = data["name"] + content = data["content"] + # check if name is null + if name == "": + return {"status": "error"} + prompts_collection.update_one({"_id": ObjectId(id)},{"$set":{"name":name, "content": content}}) + return {"status": "ok"} + + + + + diff --git a/api/user/tasks.py b/api/user/tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..a3474939ab203bc860295d748f45dda9c2436135 --- /dev/null +++ b/api/user/tasks.py @@ -0,0 +1,7 @@ +from application.worker import ingest_worker +from application.celery import celery + +@celery.task(bind=True) +def ingest(self, directory, formats, name_job, filename, user): + resp = ingest_worker(self, directory, formats, name_job, filename, user) + return resp diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..ae619974974f13e59cdb6301fbe67515316aef51 --- /dev/null +++ b/app.py @@ -0,0 +1,44 @@ +import platform +import dotenv +from application.celery import celery +from flask import Flask, request, redirect +from application.core.settings import settings +from application.api.user.routes import user +from application.api.answer.routes import answer +from application.api.internal.routes import internal + +if platform.system() == "Windows": + import pathlib + pathlib.PosixPath = pathlib.WindowsPath + +dotenv.load_dotenv() + +app = Flask(__name__) +app.register_blueprint(user) +app.register_blueprint(answer) +app.register_blueprint(internal) +app.config.update( + UPLOAD_FOLDER="inputs", + CELERY_BROKER_URL=settings.CELERY_BROKER_URL, + CELERY_RESULT_BACKEND=settings.CELERY_RESULT_BACKEND, + MONGO_URI=settings.MONGO_URI +) +celery.config_from_object("application.celeryconfig") + +@app.route("/") +def home(): + if request.remote_addr in ('0.0.0.0', '127.0.0.1', 'localhost', '172.18.0.1'): + return redirect('http://localhost:5173') + else: + return 'Welcome to DocsGPT Backend!' 
+ +@app.after_request +def after_request(response): + response.headers.add("Access-Control-Allow-Origin", "*") + response.headers.add("Access-Control-Allow-Headers", "Content-Type,Authorization") + response.headers.add("Access-Control-Allow-Methods", "GET,PUT,POST,DELETE,OPTIONS") + return response + +if __name__ == "__main__": + app.run(debug=True, port=7091) + diff --git a/celery.py b/celery.py new file mode 100644 index 0000000000000000000000000000000000000000..c19c2e75489300b0b95a70ebcc3162a07c1bd3f8 --- /dev/null +++ b/celery.py @@ -0,0 +1,9 @@ +from celery import Celery +from application.core.settings import settings + +def make_celery(app_name=__name__): + celery = Celery(app_name, broker=settings.CELERY_BROKER_URL, backend=settings.CELERY_RESULT_BACKEND) + celery.conf.update(settings) + return celery + +celery = make_celery() diff --git a/celeryconfig.py b/celeryconfig.py new file mode 100644 index 0000000000000000000000000000000000000000..712b3bfc6193b8015ec13c244601aea09abe39e6 --- /dev/null +++ b/celeryconfig.py @@ -0,0 +1,8 @@ +import os + +broker_url = os.getenv("CELERY_BROKER_URL") +result_backend = os.getenv("CELERY_RESULT_BACKEND") + +task_serializer = 'json' +result_serializer = 'json' +accept_content = ['json'] diff --git a/core/__init__.py b/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/core/__pycache__/__init__.cpython-310.pyc b/core/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ab79ff8589d8887c6293676b263e94ebff7bb94 Binary files /dev/null and b/core/__pycache__/__init__.cpython-310.pyc differ diff --git a/core/__pycache__/settings.cpython-310.pyc b/core/__pycache__/settings.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28afe965994ff14490537514d570e893a8d8f57c Binary files /dev/null and b/core/__pycache__/settings.cpython-310.pyc differ diff --git a/core/settings.py b/core/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..42dea0ff6adebb1a4787f4228f6c015b922eacff --- /dev/null +++ b/core/settings.py @@ -0,0 +1,44 @@ +from pathlib import Path +from typing import Optional +import os + +from pydantic_settings import BaseSettings +current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +class Settings(BaseSettings): + LLM_NAME: str = "docsgpt" + EMBEDDINGS_NAME: str = "huggingface_sentence-transformers/all-mpnet-base-v2" + CELERY_BROKER_URL: str = "redis://localhost:6379/0" + CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1" + MONGO_URI: str = "mongodb://localhost:27017/docsgpt" + MODEL_PATH: str = os.path.join(current_dir, "models/docsgpt-7b-f16.gguf") + TOKENS_MAX_HISTORY: int = 150 + UPLOAD_FOLDER: str = "inputs" + VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" + + API_URL: str = "http://localhost:7091" # backend url for celery worker + + API_KEY: Optional[str] = None # LLM api key + EMBEDDINGS_KEY: Optional[str] = None # api key for embeddings (if using openai, just copy API_KEY) + OPENAI_API_BASE: Optional[str] = None # azure openai api base url + OPENAI_API_VERSION: Optional[str] = None # azure openai api version + AZURE_DEPLOYMENT_NAME: Optional[str] = None # azure deployment name for answering + AZURE_EMBEDDINGS_DEPLOYMENT_NAME: Optional[str] = None # azure deployment name for embeddings + + # elasticsearch + ELASTIC_CLOUD_ID: Optional[str] = None # cloud id for elasticsearch + 
ELASTIC_USERNAME: Optional[str] = None # username for elasticsearch + ELASTIC_PASSWORD: Optional[str] = None # password for elasticsearch + ELASTIC_URL: Optional[str] = None # url for elasticsearch + ELASTIC_INDEX: Optional[str] = "docsgpt" # index name for elasticsearch + + # SageMaker config + SAGEMAKER_ENDPOINT: Optional[str] = None # SageMaker endpoint name + SAGEMAKER_REGION: Optional[str] = None # SageMaker region name + SAGEMAKER_ACCESS_KEY: Optional[str] = None # SageMaker access key + SAGEMAKER_SECRET_KEY: Optional[str] = None # SageMaker secret key + + +path = Path(__file__).parent.parent.absolute() +settings = Settings(_env_file=path.joinpath(".env"), _env_file_encoding="utf-8") diff --git a/error.py b/error.py new file mode 100644 index 0000000000000000000000000000000000000000..5d42f0ee412e5ec68b751fd8bf81c63bb9d3de3c --- /dev/null +++ b/error.py @@ -0,0 +1,15 @@ +from flask import jsonify +from werkzeug.http import HTTP_STATUS_CODES + + +def response_error(code_status, message=None): + payload = {'error': HTTP_STATUS_CODES.get(code_status, "something went wrong")} + if message: + payload['message'] = message + response = jsonify(payload) + response.status_code = code_status + return response + + +def bad_request(status_code=400, message=''): + return response_error(code_status=status_code, message=message) diff --git a/index.faiss b/index.faiss new file mode 100644 index 0000000000000000000000000000000000000000..4e3784e9cd1f2594e882a8db28e010110fb43386 Binary files /dev/null and b/index.faiss differ diff --git a/index.pkl b/index.pkl new file mode 100644 index 0000000000000000000000000000000000000000..304ee3181bbea0793355c78913eead0181d6f446 --- /dev/null +++ b/index.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1653826159295b5a262df5228ec9678a919a9fcc3ff94248eeaa55f434c071ef +size 7866 diff --git a/indexes/local/patil2016.pdf/index.faiss b/indexes/local/patil2016.pdf/index.faiss new file mode 100644 index 0000000000000000000000000000000000000000..6d4f04266c6c377f3b5d8a0de6a31c83f68b1a73 Binary files /dev/null and b/indexes/local/patil2016.pdf/index.faiss differ diff --git a/indexes/local/patil2016.pdf/index.pkl b/indexes/local/patil2016.pdf/index.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5a23a4ef29b5fb62ff9edfd25cbcb98c7e3da24b --- /dev/null +++ b/indexes/local/patil2016.pdf/index.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc1aa0edd32b66234b113edba42b67f5fc498851e584863124f44abf3920273 +size 28255 diff --git a/inputs/local/patil2016.pdf/patil2016.pdf b/inputs/local/patil2016.pdf/patil2016.pdf new file mode 100644 index 0000000000000000000000000000000000000000..fce6d78d67c3f1260ee1155e4b60f326bb26e5ea Binary files /dev/null and b/inputs/local/patil2016.pdf/patil2016.pdf differ diff --git a/llm/__init__.py b/llm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llm/__pycache__/__init__.cpython-310.pyc b/llm/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf95a2c895ccec491633590295ec5b840b4087d7 Binary files /dev/null and b/llm/__pycache__/__init__.cpython-310.pyc differ diff --git a/llm/__pycache__/anthropic.cpython-310.pyc b/llm/__pycache__/anthropic.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0d12bf1e79a812acf88cde898922184bb68cb81 Binary files /dev/null and b/llm/__pycache__/anthropic.cpython-310.pyc 
differ diff --git a/llm/__pycache__/base.cpython-310.pyc b/llm/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7c4d8a394a511ac201a96c81c01c22c0923309e Binary files /dev/null and b/llm/__pycache__/base.cpython-310.pyc differ diff --git a/llm/__pycache__/docsgpt_provider.cpython-310.pyc b/llm/__pycache__/docsgpt_provider.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..918460b87abc24a5c3fc25e6b48579126e7f6ded Binary files /dev/null and b/llm/__pycache__/docsgpt_provider.cpython-310.pyc differ diff --git a/llm/__pycache__/huggingface.cpython-310.pyc b/llm/__pycache__/huggingface.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9881b678dc75efe05e623badc946d44e8199517 Binary files /dev/null and b/llm/__pycache__/huggingface.cpython-310.pyc differ diff --git a/llm/__pycache__/llama_cpp.cpython-310.pyc b/llm/__pycache__/llama_cpp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..485858bdfd5bd3d62d1e06bb828d4a4a9bb6873d Binary files /dev/null and b/llm/__pycache__/llama_cpp.cpython-310.pyc differ diff --git a/llm/__pycache__/llm_creator.cpython-310.pyc b/llm/__pycache__/llm_creator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d75deaaf41cc477f53efa4d3eb39dee4cf3f58ca Binary files /dev/null and b/llm/__pycache__/llm_creator.cpython-310.pyc differ diff --git a/llm/__pycache__/openai.cpython-310.pyc b/llm/__pycache__/openai.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca423823f4794bcf61e2b15b975649328b457abe Binary files /dev/null and b/llm/__pycache__/openai.cpython-310.pyc differ diff --git a/llm/__pycache__/sagemaker.cpython-310.pyc b/llm/__pycache__/sagemaker.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7eb1403df23815af82fc85c1930b2ef20c94449 Binary files /dev/null and b/llm/__pycache__/sagemaker.cpython-310.pyc differ diff --git a/llm/anthropic.py b/llm/anthropic.py new file mode 100644 index 0000000000000000000000000000000000000000..a64d71e937fe5b7e25021222139588faab9e760a --- /dev/null +++ b/llm/anthropic.py @@ -0,0 +1,40 @@ +from application.llm.base import BaseLLM +from application.core.settings import settings + +class AnthropicLLM(BaseLLM): + + def __init__(self, api_key=None): + from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT + self.api_key = api_key or settings.ANTHROPIC_API_KEY # If not provided, use a default from settings + self.anthropic = Anthropic(api_key=self.api_key) + self.HUMAN_PROMPT = HUMAN_PROMPT + self.AI_PROMPT = AI_PROMPT + + def gen(self, model, messages, engine=None, max_tokens=300, stream=False, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Context \n {context} \n ### Question \n {user_question}" + if stream: + return self.gen_stream(model, prompt, max_tokens, **kwargs) + + completion = self.anthropic.completions.create( + model=model, + max_tokens_to_sample=max_tokens, + stream=stream, + prompt=f"{self.HUMAN_PROMPT} {prompt}{self.AI_PROMPT}", + ) + return completion.completion + + def gen_stream(self, model, messages, engine=None, max_tokens=300, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Context \n {context} \n ### Question \n {user_question}" + stream_response = self.anthropic.completions.create( + model=model, + prompt=f"{self.HUMAN_PROMPT} 
{prompt}{self.AI_PROMPT}", + max_tokens_to_sample=max_tokens, + stream=True, + ) + + for completion in stream_response: + yield completion.completion \ No newline at end of file diff --git a/llm/base.py b/llm/base.py new file mode 100644 index 0000000000000000000000000000000000000000..e08a3b090950d55d35a749a117ec29f4e00e5d4e --- /dev/null +++ b/llm/base.py @@ -0,0 +1,14 @@ +from abc import ABC, abstractmethod + + +class BaseLLM(ABC): + def __init__(self): + pass + + @abstractmethod + def gen(self, *args, **kwargs): + pass + + @abstractmethod + def gen_stream(self, *args, **kwargs): + pass diff --git a/llm/docsgpt_provider.py b/llm/docsgpt_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..b7d6a5ad570902c36809c5e5fedd0f390c376922 --- /dev/null +++ b/llm/docsgpt_provider.py @@ -0,0 +1,49 @@ +from application.llm.base import BaseLLM +import json +import requests + +class DocsGPTAPILLM(BaseLLM): + + def __init__(self, *args, **kwargs): + self.endpoint = "https://llm.docsgpt.co.uk" + + + def gen(self, model, engine, messages, stream=False, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + response = requests.post( + f"{self.endpoint}/answer", + json={ + "prompt": prompt, + "max_new_tokens": 30 + } + ) + response_clean = response.json()['a'].split("###")[0] + + return response_clean + + def gen_stream(self, model, engine, messages, stream=True, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + # send prompt to endpoint /stream + response = requests.post( + f"{self.endpoint}/stream", + json={ + "prompt": prompt, + "max_new_tokens": 256 + }, + stream=True + ) + + for line in response.iter_lines(): + if line: + #data = json.loads(line) + data_str = line.decode('utf-8') + if data_str.startswith("data: "): + data = json.loads(data_str[6:]) + yield data['a'] + \ No newline at end of file diff --git a/llm/huggingface.py b/llm/huggingface.py new file mode 100644 index 0000000000000000000000000000000000000000..ef3b1fbceeeb01e62c94077c477ead555a18eb06 --- /dev/null +++ b/llm/huggingface.py @@ -0,0 +1,44 @@ +from application.llm.base import BaseLLM + +class HuggingFaceLLM(BaseLLM): + + def __init__(self, api_key, llm_name='Arc53/DocsGPT-7B',q=False): + global hf + + from langchain.llms import HuggingFacePipeline + if q: + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig + tokenizer = AutoTokenizer.from_pretrained(llm_name) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16 + ) + model = AutoModelForCausalLM.from_pretrained(llm_name,quantization_config=bnb_config) + else: + from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline + tokenizer = AutoTokenizer.from_pretrained(llm_name) + model = AutoModelForCausalLM.from_pretrained(llm_name) + + pipe = pipeline( + "text-generation", model=model, + tokenizer=tokenizer, max_new_tokens=2000, + device_map="auto", eos_token_id=tokenizer.eos_token_id + ) + hf = HuggingFacePipeline(pipeline=pipe) + + def gen(self, model, engine, messages, stream=False, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### 
Context \n {context} \n ### Answer \n" + + result = hf(prompt) + + return result.content + + def gen_stream(self, model, engine, messages, stream=True, **kwargs): + + raise NotImplementedError("HuggingFaceLLM Streaming is not implemented yet.") + diff --git a/llm/llama_cpp.py b/llm/llama_cpp.py new file mode 100644 index 0000000000000000000000000000000000000000..f18d4379ee6dadc5a394f320a2e88c58994cd488 --- /dev/null +++ b/llm/llama_cpp.py @@ -0,0 +1,39 @@ +from application.llm.base import BaseLLM +from application.core.settings import settings + +class LlamaCpp(BaseLLM): + + def __init__(self, api_key, llm_name=settings.MODEL_PATH, **kwargs): + global llama + try: + from llama_cpp import Llama + except ImportError: + raise ImportError("Please install llama_cpp using pip install llama-cpp-python") + + llama = Llama(model_path=llm_name, n_ctx=2048) + + def gen(self, model, engine, messages, stream=False, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + result = llama(prompt, max_tokens=150, echo=False) + + # import sys + # print(result['choices'][0]['text'].split('### Answer \n')[-1], file=sys.stderr) + + return result['choices'][0]['text'].split('### Answer \n')[-1] + + def gen_stream(self, model, engine, messages, stream=True, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + result = llama(prompt, max_tokens=150, echo=False, stream=stream) + + # import sys + # print(list(result), file=sys.stderr) + + for item in result: + for choice in item['choices']: + yield choice['text'] diff --git a/llm/llm_creator.py b/llm/llm_creator.py new file mode 100644 index 0000000000000000000000000000000000000000..d0d6ae3f8d8d218627d66f043f574e386346fc3f --- /dev/null +++ b/llm/llm_creator.py @@ -0,0 +1,26 @@ +from application.llm.openai import OpenAILLM, AzureOpenAILLM +from application.llm.sagemaker import SagemakerAPILLM +from application.llm.huggingface import HuggingFaceLLM +from application.llm.llama_cpp import LlamaCpp +from application.llm.anthropic import AnthropicLLM +from application.llm.docsgpt_provider import DocsGPTAPILLM + + + +class LLMCreator: + llms = { + 'openai': OpenAILLM, + 'azure_openai': AzureOpenAILLM, + 'sagemaker': SagemakerAPILLM, + 'huggingface': HuggingFaceLLM, + 'llama.cpp': LlamaCpp, + 'anthropic': AnthropicLLM, + 'docsgpt': DocsGPTAPILLM + } + + @classmethod + def create_llm(cls, type, *args, **kwargs): + llm_class = cls.llms.get(type.lower()) + if not llm_class: + raise ValueError(f"No LLM class found for type {type}") + return llm_class(*args, **kwargs) \ No newline at end of file diff --git a/llm/openai.py b/llm/openai.py new file mode 100644 index 0000000000000000000000000000000000000000..a132399aacdf899d44ed8de55a37ac31ecc23a5e --- /dev/null +++ b/llm/openai.py @@ -0,0 +1,60 @@ +from application.llm.base import BaseLLM +from application.core.settings import settings + +class OpenAILLM(BaseLLM): + + def __init__(self, api_key): + global openai + from openai import OpenAI + + self.client = OpenAI( + api_key=api_key, + ) + self.api_key = api_key + + def _get_openai(self): + # Import openai when needed + import openai + + return openai + + def gen(self, model, engine, messages, stream=False, **kwargs): + response = self.client.chat.completions.create(model=model, + messages=messages, + stream=stream, + 
+                                                       **kwargs)
+
+        return response.choices[0].message.content
+
+    def gen_stream(self, model, engine, messages, stream=True, **kwargs):
+        response = self.client.chat.completions.create(model=model,
+                                                        messages=messages,
+                                                        stream=stream,
+                                                        **kwargs)
+
+        for line in response:
+            # import sys
+            # print(line.choices[0].delta.content, file=sys.stderr)
+            if line.choices[0].delta.content is not None:
+                yield line.choices[0].delta.content
+
+
+class AzureOpenAILLM(OpenAILLM):
+
+    def __init__(self, openai_api_key, openai_api_base, openai_api_version, deployment_name):
+        super().__init__(openai_api_key)
+        self.api_base = settings.OPENAI_API_BASE
+        self.api_version = settings.OPENAI_API_VERSION
+        self.deployment_name = settings.AZURE_DEPLOYMENT_NAME
+        from openai import AzureOpenAI
+        self.client = AzureOpenAI(
+            api_key=openai_api_key,
+            api_version=settings.OPENAI_API_VERSION,
+            azure_endpoint=settings.OPENAI_API_BASE,
+            azure_deployment=settings.AZURE_DEPLOYMENT_NAME,
+        )
+
+    def _get_openai(self):
+        openai = super()._get_openai()
+
+        return openai
diff --git a/llm/sagemaker.py b/llm/sagemaker.py
new file mode 100644
index 0000000000000000000000000000000000000000..84ae09adf99647c477f31268abafc3ffb6c16850
--- /dev/null
+++ b/llm/sagemaker.py
@@ -0,0 +1,139 @@
+from application.llm.base import BaseLLM
+from application.core.settings import settings
+import json
+import io
+
+
+class LineIterator:
+    """
+    A helper class for parsing the byte stream input.
+
+    The output of the model will be in the following format:
+    ```
+    b'{"outputs": [" a"]}\n'
+    b'{"outputs": [" challenging"]}\n'
+    b'{"outputs": [" problem"]}\n'
+    ...
+    ```
+
+    While usually each PayloadPart event from the event stream will contain a byte array
+    with a full json, this is not guaranteed and some of the json objects may be split across
+    PayloadPart events. For example:
+    ```
+    {'PayloadPart': {'Bytes': b'{"outputs": '}}
+    {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
+    ```
+
+    This class accounts for that by buffering the incoming bytes in an internal BytesIO
+    object and returning only complete lines (ending with a '\n' character) from __next__.
+    It keeps track of the last read position so that previously returned bytes are not
+    exposed again.
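+
+    A minimal usage sketch (mirroring how gen_stream below consumes the stream;
+    'event_stream' here stands for the 'Body' of an
+    invoke_endpoint_with_response_stream response):
+    ```
+    for line in LineIterator(event_stream):
+        print(line.decode('utf-8'))
+    ```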
+ """ + + def __init__(self, stream): + self.byte_iterator = iter(stream) + self.buffer = io.BytesIO() + self.read_pos = 0 + + def __iter__(self): + return self + + def __next__(self): + while True: + self.buffer.seek(self.read_pos) + line = self.buffer.readline() + if line and line[-1] == ord('\n'): + self.read_pos += len(line) + return line[:-1] + try: + chunk = next(self.byte_iterator) + except StopIteration: + if self.read_pos < self.buffer.getbuffer().nbytes: + continue + raise + if 'PayloadPart' not in chunk: + print('Unknown event type:' + chunk) + continue + self.buffer.seek(0, io.SEEK_END) + self.buffer.write(chunk['PayloadPart']['Bytes']) + +class SagemakerAPILLM(BaseLLM): + + def __init__(self, *args, **kwargs): + import boto3 + runtime = boto3.client( + 'runtime.sagemaker', + aws_access_key_id='xxx', + aws_secret_access_key='xxx', + region_name='us-west-2' + ) + + + self.endpoint = settings.SAGEMAKER_ENDPOINT + self.runtime = runtime + + + def gen(self, model, engine, messages, stream=False, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + + # Construct payload for endpoint + payload = { + "inputs": prompt, + "stream": False, + "parameters": { + "do_sample": True, + "temperature": 0.1, + "max_new_tokens": 30, + "repetition_penalty": 1.03, + "stop": ["", "###"] + } + } + body_bytes = json.dumps(payload).encode('utf-8') + + # Invoke the endpoint + response = self.runtime.invoke_endpoint(EndpointName=self.endpoint, + ContentType='application/json', + Body=body_bytes) + result = json.loads(response['Body'].read().decode()) + import sys + print(result[0]['generated_text'], file=sys.stderr) + return result[0]['generated_text'][len(prompt):] + + def gen_stream(self, model, engine, messages, stream=True, **kwargs): + context = messages[0]['content'] + user_question = messages[-1]['content'] + prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n" + + + # Construct payload for endpoint + payload = { + "inputs": prompt, + "stream": True, + "parameters": { + "do_sample": True, + "temperature": 0.1, + "max_new_tokens": 512, + "repetition_penalty": 1.03, + "stop": ["", "###"] + } + } + body_bytes = json.dumps(payload).encode('utf-8') + + # Invoke the endpoint + response = self.runtime.invoke_endpoint_with_response_stream(EndpointName=self.endpoint, + ContentType='application/json', + Body=body_bytes) + #result = json.loads(response['Body'].read().decode()) + event_stream = response['Body'] + start_json = b'{' + for line in LineIterator(event_stream): + if line != b'' and start_json in line: + #print(line) + data = json.loads(line[line.find(start_json):].decode('utf-8')) + if data['token']['text'] not in ["", "###"]: + print(data['token']['text'],end='') + yield data['token']['text'] \ No newline at end of file diff --git a/parser/__init__.py b/parser/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/parser/__init__.py @@ -0,0 +1 @@ + diff --git a/parser/__pycache__/__init__.cpython-310.pyc b/parser/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d7b8485471e114891e5b1e58b1a48421b72a1e6 Binary files /dev/null and b/parser/__pycache__/__init__.cpython-310.pyc differ diff --git a/parser/__pycache__/open_ai_func.cpython-310.pyc b/parser/__pycache__/open_ai_func.cpython-310.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..167143959e69e6b1324a0f16e9925adccda6d84b Binary files /dev/null and b/parser/__pycache__/open_ai_func.cpython-310.pyc differ diff --git a/parser/__pycache__/token_func.cpython-310.pyc b/parser/__pycache__/token_func.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b33a007c639722650171b4ef90f7cc449d2e930b Binary files /dev/null and b/parser/__pycache__/token_func.cpython-310.pyc differ diff --git a/parser/file/__init__.py b/parser/file/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/parser/file/__init__.py @@ -0,0 +1 @@ + diff --git a/parser/file/__pycache__/__init__.cpython-310.pyc b/parser/file/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3267074255ccff8679245c2768a70080a3699b8 Binary files /dev/null and b/parser/file/__pycache__/__init__.cpython-310.pyc differ diff --git a/parser/file/__pycache__/base.cpython-310.pyc b/parser/file/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d758673efacb4cf0ac063108bd5ba63cf42cfbee Binary files /dev/null and b/parser/file/__pycache__/base.cpython-310.pyc differ diff --git a/parser/file/__pycache__/base_parser.cpython-310.pyc b/parser/file/__pycache__/base_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c6cd9521184d12059d0b565cad01b365c9dabf4 Binary files /dev/null and b/parser/file/__pycache__/base_parser.cpython-310.pyc differ diff --git a/parser/file/__pycache__/bulk.cpython-310.pyc b/parser/file/__pycache__/bulk.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccee0317291b96f53ef271db926501010d57d59b Binary files /dev/null and b/parser/file/__pycache__/bulk.cpython-310.pyc differ diff --git a/parser/file/__pycache__/docs_parser.cpython-310.pyc b/parser/file/__pycache__/docs_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70684d8a015fecec8ab6ac76eda6a9b7a2526678 Binary files /dev/null and b/parser/file/__pycache__/docs_parser.cpython-310.pyc differ diff --git a/parser/file/__pycache__/epub_parser.cpython-310.pyc b/parser/file/__pycache__/epub_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28e629b1e75cd5bb18a0ce91e28b5aea0bb80116 Binary files /dev/null and b/parser/file/__pycache__/epub_parser.cpython-310.pyc differ diff --git a/parser/file/__pycache__/html_parser.cpython-310.pyc b/parser/file/__pycache__/html_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccb70083c45807d9e7b488b779fa1befdfce3217 Binary files /dev/null and b/parser/file/__pycache__/html_parser.cpython-310.pyc differ diff --git a/parser/file/__pycache__/markdown_parser.cpython-310.pyc b/parser/file/__pycache__/markdown_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ee52f8bb3b4d7f2f40a9689176adb35c459a169 Binary files /dev/null and b/parser/file/__pycache__/markdown_parser.cpython-310.pyc differ diff --git a/parser/file/__pycache__/rst_parser.cpython-310.pyc b/parser/file/__pycache__/rst_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b4bc7dd426c5fbfdc5255aa776d6476fdb96321 Binary files /dev/null and b/parser/file/__pycache__/rst_parser.cpython-310.pyc differ diff --git 
a/parser/file/__pycache__/tabular_parser.cpython-310.pyc b/parser/file/__pycache__/tabular_parser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f024193bfb916a3b6f725a3ecb997cf1e1567037 Binary files /dev/null and b/parser/file/__pycache__/tabular_parser.cpython-310.pyc differ diff --git a/parser/file/base.py b/parser/file/base.py new file mode 100644 index 0000000000000000000000000000000000000000..f63e8ef64d21208d886c4c6d52c1d6e097609e3e --- /dev/null +++ b/parser/file/base.py @@ -0,0 +1,19 @@ +"""Base reader class.""" +from abc import abstractmethod +from typing import Any, List + +from langchain.docstore.document import Document as LCDocument +from application.parser.schema.base import Document + + +class BaseReader: + """Utilities for loading data from a directory.""" + + @abstractmethod + def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]: + """Load data from the input directory.""" + + def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]: + """Load data in LangChain document format.""" + docs = self.load_data(**load_kwargs) + return [d.to_langchain_format() for d in docs] diff --git a/parser/file/base_parser.py b/parser/file/base_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..753a56f9797432f053fb96a72bdb782a2b20bd05 --- /dev/null +++ b/parser/file/base_parser.py @@ -0,0 +1,38 @@ +"""Base parser and config class.""" + +from abc import abstractmethod +from pathlib import Path +from typing import Dict, List, Optional, Union + + +class BaseParser: + """Base class for all parsers.""" + + def __init__(self, parser_config: Optional[Dict] = None): + """Init params.""" + self._parser_config = parser_config + + def init_parser(self) -> None: + """Init parser and store it.""" + parser_config = self._init_parser() + self._parser_config = parser_config + + @property + def parser_config_set(self) -> bool: + """Check if parser config is set.""" + return self._parser_config is not None + + @property + def parser_config(self) -> Dict: + """Check if parser config is set.""" + if self._parser_config is None: + raise ValueError("Parser config not set.") + return self._parser_config + + @abstractmethod + def _init_parser(self) -> Dict: + """Initialize the parser with the config.""" + + @abstractmethod + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + """Parse file.""" diff --git a/parser/file/bulk.py b/parser/file/bulk.py new file mode 100644 index 0000000000000000000000000000000000000000..593681e29b2bf109cfe5e5cd21476d71e1e44412 --- /dev/null +++ b/parser/file/bulk.py @@ -0,0 +1,163 @@ +"""Simple reader that reads files of different formats from a directory.""" +import logging +from pathlib import Path +from typing import Callable, Dict, List, Optional, Union + +from application.parser.file.base import BaseReader +from application.parser.file.base_parser import BaseParser +from application.parser.file.docs_parser import DocxParser, PDFParser +from application.parser.file.epub_parser import EpubParser +from application.parser.file.html_parser import HTMLParser +from application.parser.file.markdown_parser import MarkdownParser +from application.parser.file.rst_parser import RstParser +from application.parser.file.tabular_parser import PandasCSVParser +from application.parser.schema.base import Document + +DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { + ".pdf": PDFParser(), + ".docx": DocxParser(), + ".csv": PandasCSVParser(), + ".epub": EpubParser(), + 
".md": MarkdownParser(), + ".rst": RstParser(), + ".html": HTMLParser(), + ".mdx": MarkdownParser(), +} + + +class SimpleDirectoryReader(BaseReader): + """Simple directory reader. + + Can read files into separate documents, or concatenates + files into one document text. + + Args: + input_dir (str): Path to the directory. + input_files (List): List of file paths to read (Optional; overrides input_dir) + exclude_hidden (bool): Whether to exclude hidden files (dotfiles). + errors (str): how encoding and decoding errors are to be handled, + see https://docs.python.org/3/library/functions.html#open + recursive (bool): Whether to recursively search in subdirectories. + False by default. + required_exts (Optional[List[str]]): List of required extensions. + Default is None. + file_extractor (Optional[Dict[str, BaseParser]]): A mapping of file + extension to a BaseParser class that specifies how to convert that file + to text. See DEFAULT_FILE_EXTRACTOR. + num_files_limit (Optional[int]): Maximum number of files to read. + Default is None. + file_metadata (Optional[Callable[str, Dict]]): A function that takes + in a filename and returns a Dict of metadata for the Document. + Default is None. + """ + + def __init__( + self, + input_dir: Optional[str] = None, + input_files: Optional[List] = None, + exclude_hidden: bool = True, + errors: str = "ignore", + recursive: bool = True, + required_exts: Optional[List[str]] = None, + file_extractor: Optional[Dict[str, BaseParser]] = None, + num_files_limit: Optional[int] = None, + file_metadata: Optional[Callable[[str], Dict]] = None, + chunk_size_max: int = 2048, + ) -> None: + """Initialize with parameters.""" + super().__init__() + + if not input_dir and not input_files: + raise ValueError("Must provide either `input_dir` or `input_files`.") + + self.errors = errors + + self.recursive = recursive + self.exclude_hidden = exclude_hidden + self.required_exts = required_exts + self.num_files_limit = num_files_limit + + if input_files: + self.input_files = [] + for path in input_files: + print(path) + input_file = Path(path) + self.input_files.append(input_file) + elif input_dir: + self.input_dir = Path(input_dir) + self.input_files = self._add_files(self.input_dir) + + self.file_extractor = file_extractor or DEFAULT_FILE_EXTRACTOR + self.file_metadata = file_metadata + + def _add_files(self, input_dir: Path) -> List[Path]: + """Add files.""" + input_files = sorted(input_dir.iterdir()) + new_input_files = [] + dirs_to_explore = [] + for input_file in input_files: + if input_file.is_dir(): + if self.recursive: + dirs_to_explore.append(input_file) + elif self.exclude_hidden and input_file.name.startswith("."): + continue + elif ( + self.required_exts is not None + and input_file.suffix not in self.required_exts + ): + continue + else: + new_input_files.append(input_file) + + for dir_to_explore in dirs_to_explore: + sub_input_files = self._add_files(dir_to_explore) + new_input_files.extend(sub_input_files) + + if self.num_files_limit is not None and self.num_files_limit > 0: + new_input_files = new_input_files[0: self.num_files_limit] + + # print total number of files added + logging.debug( + f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}" + ) + + return new_input_files + + def load_data(self, concatenate: bool = False) -> List[Document]: + """Load data from the input directory. + + Args: + concatenate (bool): whether to concatenate all files into one document. + If set to True, file metadata is ignored. + False by default. 
+ + Returns: + List[Document]: A list of documents. + + """ + data: Union[str, List[str]] = "" + data_list: List[str] = [] + metadata_list = [] + for input_file in self.input_files: + if input_file.suffix in self.file_extractor: + parser = self.file_extractor[input_file.suffix] + if not parser.parser_config_set: + parser.init_parser() + data = parser.parse_file(input_file, errors=self.errors) + else: + # do standard read + with open(input_file, "r", errors=self.errors) as f: + data = f.read() + if isinstance(data, List): + data_list.extend(data) + else: + data_list.append(str(data)) + if self.file_metadata is not None: + metadata_list.append(self.file_metadata(str(input_file))) + + if concatenate: + return [Document("\n".join(data_list))] + elif self.file_metadata is not None: + return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)] + else: + return [Document(d) for d in data_list] diff --git a/parser/file/docs_parser.py b/parser/file/docs_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..861e8e5899935049f460799abbafd60c13164e34 --- /dev/null +++ b/parser/file/docs_parser.py @@ -0,0 +1,59 @@ +"""Docs parser. + +Contains parsers for docx, pdf files. + +""" +from pathlib import Path +from typing import Dict + +from application.parser.file.base_parser import BaseParser + + +class PDFParser(BaseParser): + """PDF parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> str: + """Parse file.""" + try: + import PyPDF2 + except ImportError: + raise ValueError("PyPDF2 is required to read PDF files.") + text_list = [] + with open(file, "rb") as fp: + # Create a PDF object + pdf = PyPDF2.PdfReader(fp) + + # Get the number of pages in the PDF document + num_pages = len(pdf.pages) + + # Iterate over every page + for page in range(num_pages): + # Extract the text from the page + page_text = pdf.pages[page].extract_text() + text_list.append(page_text) + text = "\n".join(text_list) + + return text + + +class DocxParser(BaseParser): + """Docx parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> str: + """Parse file.""" + try: + import docx2txt + except ImportError: + raise ValueError("docx2txt is required to read Microsoft Word files.") + + text = docx2txt.process(file) + + return text diff --git a/parser/file/epub_parser.py b/parser/file/epub_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..4f5e87115a082981328adcb838c0a361fd4733bf --- /dev/null +++ b/parser/file/epub_parser.py @@ -0,0 +1,43 @@ +"""Epub parser. + +Contains parsers for epub files. +""" + +from pathlib import Path +from typing import Dict + +from application.parser.file.base_parser import BaseParser + + +class EpubParser(BaseParser): + """Epub Parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> str: + """Parse file.""" + try: + import ebooklib + from ebooklib import epub + except ImportError: + raise ValueError("`EbookLib` is required to read Epub files.") + try: + import html2text + except ImportError: + raise ValueError("`html2text` is required to parse Epub files.") + + text_list = [] + book = epub.read_epub(file, options={"ignore_ncx": True}) + + # Iterate through all chapters. + for item in book.get_items(): + # Chapters are typically located in epub documents items. 
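+            # ITEM_DOCUMENT items hold the book's XHTML content documents; html2text
+            # renders each one as plain text before the pieces are joined with newlines below.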
+ if item.get_type() == ebooklib.ITEM_DOCUMENT: + text_list.append( + html2text.html2text(item.get_content().decode("utf-8")) + ) + + text = "\n".join(text_list) + return text diff --git a/parser/file/html_parser.py b/parser/file/html_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..2a74a97a53a67f9eeb26d621ef2ff690a1315c7b --- /dev/null +++ b/parser/file/html_parser.py @@ -0,0 +1,83 @@ +"""HTML parser. + +Contains parser for html files. + +""" +import re +from pathlib import Path +from typing import Dict, Union + +from application.parser.file.base_parser import BaseParser + + +class HTMLParser(BaseParser): + """HTML parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]: + """Parse file. + + Returns: + Union[str, List[str]]: a string or a List of strings. + """ + try: + from unstructured.partition.html import partition_html + from unstructured.staging.base import convert_to_isd + from unstructured.cleaners.core import clean + except ImportError: + raise ValueError("unstructured package is required to parse HTML files.") + + # Using the unstructured library to convert the html to isd format + # isd sample : isd = [ + # {"text": "My Title", "type": "Title"}, + # {"text": "My Narrative", "type": "NarrativeText"} + # ] + with open(file, "r", encoding="utf-8") as fp: + elements = partition_html(file=fp) + isd = convert_to_isd(elements) + + # Removing non ascii charactwers from isd_el['text'] + for isd_el in isd: + isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode() + + # Removing all the \n characters from isd_el['text'] using regex and replace with single space + # Removing all the extra spaces from isd_el['text'] using regex and replace with single space + for isd_el in isd: + isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL) + isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL) + + # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation + for isd_el in isd: + clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True) + + # Creating a list of all the indexes of isd_el['type'] = 'Title' + title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title'] + + # Creating 'Chunks' - List of lists of strings + # each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title' + # Each Chunk can be thought of as an individual set of data, which can be sent to the model + # Where Each Title is grouped together with the data under it + + Chunks = [[]] + final_chunks = list(list()) + + for i, isd_el in enumerate(isd): + if i in title_indexes: + Chunks.append([]) + Chunks[-1].append(isd_el['text']) + + # Removing all the chunks with sum of length of all the strings in the chunk < 25 + # TODO: This value can be an user defined variable + for chunk in Chunks: + # sum of length of all the strings in the chunk + sum = 0 + sum += len(str(chunk)) + if sum < 25: + Chunks.remove(chunk) + else: + # appending all the approved chunks to final_chunks as a single string + final_chunks.append(" ".join([str(item) for item in chunk])) + return final_chunks diff --git a/parser/file/markdown_parser.py b/parser/file/markdown_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..d906e9b6c1d8fe41386e39b8ac7cb20f358e8501 --- /dev/null +++ b/parser/file/markdown_parser.py @@ -0,0 
+1,145 @@ +"""Markdown parser. + +Contains parser for md files. + +""" +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +import tiktoken +from application.parser.file.base_parser import BaseParser + + +class MarkdownParser(BaseParser): + """Markdown parser. + + Extract text from markdown files. + Returns dictionary with keys as headers and values as the text between headers. + + """ + + def __init__( + self, + *args: Any, + remove_hyperlinks: bool = True, + remove_images: bool = True, + max_tokens: int = 2048, + # remove_tables: bool = True, + **kwargs: Any, + ) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._remove_hyperlinks = remove_hyperlinks + self._remove_images = remove_images + self._max_tokens = max_tokens + # self._remove_tables = remove_tables + + def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], + current_text: str): + """Append to tups chunk.""" + num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text)) + if num_tokens > self._max_tokens: + chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)] + for chunk in chunks: + tups.append((current_header, chunk)) + else: + tups.append((current_header, current_text)) + return tups + + def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]: + """Convert a markdown file to a dictionary. + + The keys are the headers and the values are the text under each header. + + """ + markdown_tups: List[Tuple[Optional[str], str]] = [] + lines = markdown_text.split("\n") + + current_header = None + current_text = "" + + for line in lines: + header_match = re.match(r"^#+\s", line) + if header_match: + if current_header is not None: + if current_text == "" or None: + continue + markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text) + + current_header = line + current_text = "" + else: + current_text += line + "\n" + markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text) + + if current_header is not None: + # pass linting, assert keys are defined + markdown_tups = [ + (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) + for key, value in markdown_tups + ] + else: + markdown_tups = [ + (key, re.sub("\n", "", value)) for key, value in markdown_tups + ] + + return markdown_tups + + def remove_images(self, content: str) -> str: + """Get a dictionary of a markdown file from its path.""" + pattern = r"!{1}\[\[(.*)\]\]" + content = re.sub(pattern, "", content) + return content + + # def remove_tables(self, content: str) -> List[List[str]]: + # """Convert markdown tables to nested lists.""" + # table_rows_pattern = r"((\r?\n){2}|^)([^\r\n]*\|[^\r\n]*(\r?\n)?)+(?=(\r?\n){2}|$)" + # table_cells_pattern = r"([^\|\r\n]*)\|" + # + # table_rows = re.findall(table_rows_pattern, content, re.MULTILINE) + # table_lists = [] + # for row in table_rows: + # cells = re.findall(table_cells_pattern, row[2]) + # cells = [cell.strip() for cell in cells if cell.strip()] + # table_lists.append(cells) + # return str(table_lists) + + def remove_hyperlinks(self, content: str) -> str: + """Get a dictionary of a markdown file from its path.""" + pattern = r"\[(.*?)\]\((.*?)\)" + content = re.sub(pattern, r"\1", content) + return content + + def _init_parser(self) -> Dict: + """Initialize the parser with the config.""" + return {} + + def parse_tups( + self, filepath: Path, errors: str = 
"ignore" + ) -> List[Tuple[Optional[str], str]]: + """Parse file into tuples.""" + with open(filepath, "r") as f: + content = f.read() + if self._remove_hyperlinks: + content = self.remove_hyperlinks(content) + if self._remove_images: + content = self.remove_images(content) + # if self._remove_tables: + # content = self.remove_tables(content) + markdown_tups = self.markdown_to_tups(content) + return markdown_tups + + def parse_file( + self, filepath: Path, errors: str = "ignore" + ) -> Union[str, List[str]]: + """Parse file into string.""" + tups = self.parse_tups(filepath, errors=errors) + results = [] + # TODO: don't include headers right now + for header, value in tups: + if header is None: + results.append(value) + else: + results.append(f"\n\n{header}\n{value}") + return results diff --git a/parser/file/openapi3_parser.py b/parser/file/openapi3_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..3c5082fa278e46eabaacbcf9cc62eb53b6e66162 --- /dev/null +++ b/parser/file/openapi3_parser.py @@ -0,0 +1,51 @@ +from urllib.parse import urlparse + +from openapi_parser import parse + +try: + from application.parser.file.base_parser import BaseParser +except ModuleNotFoundError: + from base_parser import BaseParser + + +class OpenAPI3Parser(BaseParser): + def init_parser(self) -> None: + return super().init_parser() + + def get_base_urls(self, urls): + base_urls = [] + for i in urls: + parsed_url = urlparse(i) + base_url = parsed_url.scheme + "://" + parsed_url.netloc + if base_url not in base_urls: + base_urls.append(base_url) + return base_urls + + def get_info_from_paths(self, path): + info = "" + if path.operations: + for operation in path.operations: + info += ( + f"\n{operation.method.value}=" + f"{operation.responses[0].description}" + ) + return info + + def parse_file(self, file_path): + data = parse(file_path) + results = "" + base_urls = self.get_base_urls(link.url for link in data.servers) + base_urls = ",".join([base_url for base_url in base_urls]) + results += f"Base URL:{base_urls}\n" + i = 1 + for path in data.paths: + info = self.get_info_from_paths(path) + results += ( + f"Path{i}: {path.url}\n" + f"description: {path.description}\n" + f"parameters: {path.parameters}\nmethods: {info}\n" + ) + i += 1 + with open("results.txt", "w") as f: + f.write(results) + return results diff --git a/parser/file/rst_parser.py b/parser/file/rst_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..633ec844559f1f8aac25822fde58f573ab053684 --- /dev/null +++ b/parser/file/rst_parser.py @@ -0,0 +1,173 @@ +"""reStructuredText parser. + +Contains parser for md files. + +""" +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +from application.parser.file.base_parser import BaseParser + + +class RstParser(BaseParser): + """reStructuredText parser. + + Extract text from .rst files. + Returns dictionary with keys as headers and values as the text between headers. 
+ + """ + + def __init__( + self, + *args: Any, + remove_hyperlinks: bool = True, + remove_images: bool = True, + remove_table_excess: bool = True, + remove_interpreters: bool = True, + remove_directives: bool = True, + remove_whitespaces_excess: bool = True, + # Be careful with remove_characters_excess, might cause data loss + remove_characters_excess: bool = True, + **kwargs: Any, + ) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._remove_hyperlinks = remove_hyperlinks + self._remove_images = remove_images + self._remove_table_excess = remove_table_excess + self._remove_interpreters = remove_interpreters + self._remove_directives = remove_directives + self._remove_whitespaces_excess = remove_whitespaces_excess + self._remove_characters_excess = remove_characters_excess + + def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]: + """Convert a reStructuredText file to a dictionary. + + The keys are the headers and the values are the text under each header. + + """ + rst_tups: List[Tuple[Optional[str], str]] = [] + lines = rst_text.split("\n") + + current_header = None + current_text = "" + + for i, line in enumerate(lines): + header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line) + if header_match and i > 0 and ( + len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]): + if current_header is not None: + if current_text == "" or None: + continue + # removes the next heading from current Document + if current_text.endswith(lines[i - 1] + "\n"): + current_text = current_text[:len(current_text) - len(lines[i - 1] + "\n")] + rst_tups.append((current_header, current_text)) + + current_header = lines[i - 1] + current_text = "" + else: + current_text += line + "\n" + + rst_tups.append((current_header, current_text)) + + # TODO: Format for rst + # + # if current_header is not None: + # # pass linting, assert keys are defined + # rst_tups = [ + # (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) + # for key, value in rst_tups + # ] + # else: + # rst_tups = [ + # (key, re.sub("\n", "", value)) for key, value in rst_tups + # ] + + if current_header is None: + rst_tups = [ + (key, re.sub("\n", "", value)) for key, value in rst_tups + ] + return rst_tups + + def remove_images(self, content: str) -> str: + pattern = r"\.\. image:: (.*)" + content = re.sub(pattern, "", content) + return content + + def remove_hyperlinks(self, content: str) -> str: + pattern = r"`(.*?) 
<(.*?)>`_" + content = re.sub(pattern, r"\1", content) + return content + + def remove_directives(self, content: str) -> str: + """Removes reStructuredText Directives""" + pattern = r"`\.\.([^:]+)::" + content = re.sub(pattern, "", content) + return content + + def remove_interpreters(self, content: str) -> str: + """Removes reStructuredText Interpreted Text Roles""" + pattern = r":(\w+):" + content = re.sub(pattern, "", content) + return content + + def remove_table_excess(self, content: str) -> str: + """Pattern to remove grid table separators""" + pattern = r"^\+[-]+\+[-]+\+$" + content = re.sub(pattern, "", content, flags=re.MULTILINE) + return content + + def remove_whitespaces_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]: + """Pattern to match 2 or more consecutive whitespaces""" + pattern = r"\s{2,}" + content = [(key, re.sub(pattern, " ", value)) for key, value in content] + return content + + def remove_characters_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]: + """Pattern to match 2 or more consecutive characters""" + pattern = r"(\S)\1{2,}" + content = [(key, re.sub(pattern, r"\1\1\1", value, flags=re.MULTILINE)) for key, value in content] + return content + + def _init_parser(self) -> Dict: + """Initialize the parser with the config.""" + return {} + + def parse_tups( + self, filepath: Path, errors: str = "ignore" + ) -> List[Tuple[Optional[str], str]]: + """Parse file into tuples.""" + with open(filepath, "r") as f: + content = f.read() + if self._remove_hyperlinks: + content = self.remove_hyperlinks(content) + if self._remove_images: + content = self.remove_images(content) + if self._remove_table_excess: + content = self.remove_table_excess(content) + if self._remove_directives: + content = self.remove_directives(content) + if self._remove_interpreters: + content = self.remove_interpreters(content) + rst_tups = self.rst_to_tups(content) + if self._remove_whitespaces_excess: + rst_tups = self.remove_whitespaces_excess(rst_tups) + if self._remove_characters_excess: + rst_tups = self.remove_characters_excess(rst_tups) + return rst_tups + + def parse_file( + self, filepath: Path, errors: str = "ignore" + ) -> Union[str, List[str]]: + """Parse file into string.""" + tups = self.parse_tups(filepath, errors=errors) + results = [] + # TODO: don't include headers right now + for header, value in tups: + if header is None: + results.append(value) + else: + results.append(f"\n\n{header}\n{value}") + return results diff --git a/parser/file/tabular_parser.py b/parser/file/tabular_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..81355ae079819412c33e0ba7723b7ab9632fe885 --- /dev/null +++ b/parser/file/tabular_parser.py @@ -0,0 +1,115 @@ +"""Tabular parser. + +Contains parsers for tabular data files. + +""" +from pathlib import Path +from typing import Any, Dict, List, Union + +from application.parser.file.base_parser import BaseParser + + +class CSVParser(BaseParser): + """CSV parser. + + Args: + concat_rows (bool): whether to concatenate all rows into one document. + If set to False, a Document will be created for each row. + True by default. + + """ + + def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._concat_rows = concat_rows + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + """Parse file. 
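+
+        Example (a sketch; the file name is hypothetical):
+
+            rows = CSVParser(concat_rows=False).parse_file(Path("table.csv"))
+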
+ + Returns: + Union[str, List[str]]: a string or a List of strings. + + """ + try: + import csv + except ImportError: + raise ValueError("csv module is required to read CSV files.") + text_list = [] + with open(file, "r") as fp: + csv_reader = csv.reader(fp) + for row in csv_reader: + text_list.append(", ".join(row)) + if self._concat_rows: + return "\n".join(text_list) + else: + return text_list + + +class PandasCSVParser(BaseParser): + r"""Pandas-based CSV parser. + + Parses CSVs using the separator detection from Pandas `read_csv`function. + If special parameters are required, use the `pandas_config` dict. + + Args: + concat_rows (bool): whether to concatenate all rows into one document. + If set to False, a Document will be created for each row. + True by default. + + col_joiner (str): Separator to use for joining cols per row. + Set to ", " by default. + + row_joiner (str): Separator to use for joining each row. + Only used when `concat_rows=True`. + Set to "\n" by default. + + pandas_config (dict): Options for the `pandas.read_csv` function call. + Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html + for more information. + Set to empty dict by default, this means pandas will try to figure + out the separators, table head, etc. on its own. + + """ + + def __init__( + self, + *args: Any, + concat_rows: bool = True, + col_joiner: str = ", ", + row_joiner: str = "\n", + pandas_config: dict = {}, + **kwargs: Any + ) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._concat_rows = concat_rows + self._col_joiner = col_joiner + self._row_joiner = row_joiner + self._pandas_config = pandas_config + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + """Parse file.""" + try: + import pandas as pd + except ImportError: + raise ValueError("pandas module is required to read CSV files.") + + df = pd.read_csv(file, **self._pandas_config) + + text_list = df.apply( + lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 + ).tolist() + + if self._concat_rows: + return (self._row_joiner).join(text_list) + else: + return text_list diff --git a/parser/java2doc.py b/parser/java2doc.py new file mode 100644 index 0000000000000000000000000000000000000000..2a8bfa3a42208560dc54953e3cfbece249d2b92b --- /dev/null +++ b/parser/java2doc.py @@ -0,0 +1,66 @@ +import os + +import javalang + + +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.java'): + files_list.append(os.path.join(root, file)) + return files_list + + +def extract_functions(file_path): + with open(file_path, "r") as file: + java_code = file.read() + methods = {} + tree = javalang.parse.parse(java_code) + for _, node in tree.filter(javalang.tree.MethodDeclaration): + method_name = node.name + start_line = node.position.line - 1 + end_line = start_line + brace_count = 0 + for line in java_code.splitlines()[start_line:]: + end_line += 1 + brace_count += line.count("{") - line.count("}") + if brace_count == 0: + break + method_source_code = "\n".join(java_code.splitlines()[start_line:end_line]) + methods[method_name] = method_source_code + return methods + + +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = javalang.parse.parse(source_code) + for class_decl in tree.types: + class_name = class_decl.name + declarations = [] + methods = 
[] + for field_decl in class_decl.fields: + field_name = field_decl.declarators[0].name + field_type = field_decl.type.name + declarations.append(f"{field_type} {field_name}") + for method_decl in class_decl.methods: + methods.append(method_decl.name) + class_string = "Declarations: " + ", ".join(declarations) + "\n Method name: " + ", ".join(methods) + classes[class_name] = class_string + return classes + + +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict diff --git a/parser/js2doc.py b/parser/js2doc.py new file mode 100644 index 0000000000000000000000000000000000000000..6dc448123afae253d19d6ce4e2ea5a0575121846 --- /dev/null +++ b/parser/js2doc.py @@ -0,0 +1,70 @@ +import os + +import escodegen +import esprima + + +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.js'): + files_list.append(os.path.join(root, file)) + return files_list + + +def extract_functions(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + functions = {} + tree = esprima.parseScript(source_code) + for node in tree.body: + if node.type == 'FunctionDeclaration': + func_name = node.id.name if node.id else '' + functions[func_name] = escodegen.generate(node) + elif node.type == 'VariableDeclaration': + for declaration in node.declarations: + if declaration.init and declaration.init.type == 'FunctionExpression': + func_name = declaration.id.name if declaration.id else '' + functions[func_name] = escodegen.generate(declaration.init) + elif node.type == 'ClassDeclaration': + for subnode in node.body.body: + if subnode.type == 'MethodDefinition': + func_name = subnode.key.name + functions[func_name] = escodegen.generate(subnode.value) + elif subnode.type == 'VariableDeclaration': + for declaration in subnode.declarations: + if declaration.init and declaration.init.type == 'FunctionExpression': + func_name = declaration.id.name if declaration.id else '' + functions[func_name] = escodegen.generate(declaration.init) + return functions + + +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = esprima.parseScript(source_code) + for node in tree.body: + if node.type == 'ClassDeclaration': + class_name = node.id.name + function_names = [] + for subnode in node.body.body: + if subnode.type == 'MethodDefinition': + function_names.append(subnode.key.name) + classes[class_name] = ", ".join(function_names) + return classes + + +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict diff --git a/parser/open_ai_func.py b/parser/open_ai_func.py new file mode 100644 index 0000000000000000000000000000000000000000..ede635a857fbf33237372fe0c520efec157d7d11 --- /dev/null +++ b/parser/open_ai_func.py @@ -0,0 +1,94 @@ +import os + +import tiktoken +from application.vectorstore.vector_creator import VectorCreator +from application.core.settings import settings +from retry import retry + + +# 
from langchain.embeddings import HuggingFaceEmbeddings +# from langchain.embeddings import HuggingFaceInstructEmbeddings +# from langchain.embeddings import CohereEmbeddings + + +def num_tokens_from_string(string: str, encoding_name: str) -> int: + # Function to convert string to tokens and estimate user cost. + encoding = tiktoken.get_encoding(encoding_name) + num_tokens = len(encoding.encode(string)) + total_price = ((num_tokens / 1000) * 0.0004) + return num_tokens, total_price + + +@retry(tries=10, delay=60) +def store_add_texts_with_retry(store, i): + store.add_texts([i.page_content], metadatas=[i.metadata]) + # store_pine.add_texts([i.page_content], metadatas=[i.metadata]) + + +def call_openai_api(docs, folder_name, task_status): + # Function to create a vector store from the documents and save it to disk. + + # create output folder if it doesn't exist + if not os.path.exists(f"{folder_name}"): + os.makedirs(f"{folder_name}") + + from tqdm import tqdm + c1 = 0 + if settings.VECTOR_STORE == "faiss": + docs_init = [docs[0]] + docs.pop(0) + + store = VectorCreator.create_vectorstore( + settings.VECTOR_STORE, + docs_init = docs_init, + path=f"{folder_name}", + embeddings_key=os.getenv("EMBEDDINGS_KEY") + ) + else: + store = VectorCreator.create_vectorstore( + settings.VECTOR_STORE, + path=f"{folder_name}", + embeddings_key=os.getenv("EMBEDDINGS_KEY") + ) + # Uncomment for MPNet embeddings + # model_name = "sentence-transformers/all-mpnet-base-v2" + # hf = HuggingFaceEmbeddings(model_name=model_name) + # store = FAISS.from_documents(docs_test, hf) + s1 = len(docs) + for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), + bar_format='{l_bar}{bar}| Time Left: {remaining}'): + try: + task_status.update_state(state='PROGRESS', meta={'current': int((c1 / s1) * 100)}) + store_add_texts_with_retry(store, i) + except Exception as e: + print(e) + print("Error on ", i) + print("Saving progress") + print(f"stopped at {c1} out of {len(docs)}") + store.save_local(f"{folder_name}") + break + c1 += 1 + if settings.VECTOR_STORE == "faiss": + store.save_local(f"{folder_name}") + + +def get_user_permission(docs, folder_name): + # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. + # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents. + # docs_content = (" ".join(docs)) + docs_content = "" + for doc in docs: + docs_content += doc.page_content + + tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base") + # Here we print the number of tokens and the approx user cost with some visually appealing formatting. + print(f"Number of Tokens = {format(tokens, ',d')}") + print(f"Approx Cost = ${format(total_price, ',.2f')}") + # Here we check for user permission before calling the API. + user_input = input("Price Okay? (Y/N) \n").lower() + if user_input == "y": + call_openai_api(docs, folder_name) + elif user_input == "": + call_openai_api(docs, folder_name) + else: + print("The API was not called. 
No money was spent.") diff --git a/parser/py2doc.py b/parser/py2doc.py new file mode 100644 index 0000000000000000000000000000000000000000..3a8175d4f84fa9c9a3f0859d14fba32446194878 --- /dev/null +++ b/parser/py2doc.py @@ -0,0 +1,121 @@ +import ast +import os +from pathlib import Path + +import tiktoken +from langchain.llms import OpenAI +from langchain.prompts import PromptTemplate + + +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.py'): + files_list.append(os.path.join(root, file)) + return files_list + + +def extract_functions(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + functions = {} + tree = ast.parse(source_code) + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef): + func_name = node.name + func_def = ast.get_source_segment(source_code, node) + functions[func_name] = func_def + return functions + + +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = ast.parse(source_code) + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + class_name = node.name + function_names = [] + for subnode in ast.walk(node): + if isinstance(subnode, ast.FunctionDef): + function_names.append(subnode.name) + classes[class_name] = ", ".join(function_names) + return classes + + +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict + + +def parse_functions(functions_dict, formats, dir): + c1 = len(functions_dict) + for i, (source, functions) in enumerate(functions_dict.items(), start=1): + print(f"Processing file {i}/{c1}") + source_w = source.replace(dir + "/", "").replace("." + formats, ".md") + subfolders = "/".join(source_w.split("/")[:-1]) + Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True) + for j, (name, function) in enumerate(functions.items(), start=1): + print(f"Processing function {j}/{len(functions)}") + prompt = PromptTemplate( + input_variables=["code"], + template="Code: \n{code}, \nDocumentation: ", + ) + llm = OpenAI(temperature=0) + response = llm(prompt.format(code=function)) + mode = "a" if Path(f"outputs/{source_w}").exists() else "w" + with open(f"outputs/{source_w}", mode) as f: + f.write( + f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}") + + +def parse_classes(classes_dict, formats, dir): + c1 = len(classes_dict) + for i, (source, classes) in enumerate(classes_dict.items()): + print(f"Processing file {i + 1}/{c1}") + source_w = source.replace(dir + "/", "").replace("." 
+ formats, ".md") + subfolders = "/".join(source_w.split("/")[:-1]) + Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True) + for name, function_names in classes.items(): + print(f"Processing Class {i + 1}/{c1}") + prompt = PromptTemplate( + input_variables=["class_name", "functions_names"], + template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ", + ) + llm = OpenAI(temperature=0) + response = llm(prompt.format(class_name=name, functions_names=function_names)) + + with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f: + f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}") + + +def transform_to_docs(functions_dict, classes_dict, formats, dir): + docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()]) + docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()]) + + num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(docs_content)) + total_price = ((num_tokens / 1000) * 0.02) + + print(f"Number of Tokens = {num_tokens:,d}") + print(f"Approx Cost = ${total_price:,.2f}") + + user_input = input("Price Okay? (Y/N)\n").lower() + if user_input == "y" or user_input == "": + if not Path("outputs").exists(): + Path("outputs").mkdir() + parse_functions(functions_dict, formats, dir) + parse_classes(classes_dict, formats, dir) + print("All done!") + else: + print("The API was not called. No money was spent.") diff --git a/parser/schema/__init__.py b/parser/schema/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/parser/schema/__init__.py @@ -0,0 +1 @@ + diff --git a/parser/schema/__pycache__/__init__.cpython-310.pyc b/parser/schema/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65128e835f4b729edc091ba37357de4b3696d768 Binary files /dev/null and b/parser/schema/__pycache__/__init__.cpython-310.pyc differ diff --git a/parser/schema/__pycache__/base.cpython-310.pyc b/parser/schema/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48f6c6deaa7a973b689477ad70579aee6f9b0189 Binary files /dev/null and b/parser/schema/__pycache__/base.cpython-310.pyc differ diff --git a/parser/schema/__pycache__/schema.cpython-310.pyc b/parser/schema/__pycache__/schema.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20c1685c5ac0ad4cb961337a53c844eb4a3fec56 Binary files /dev/null and b/parser/schema/__pycache__/schema.cpython-310.pyc differ diff --git a/parser/schema/base.py b/parser/schema/base.py new file mode 100644 index 0000000000000000000000000000000000000000..61670f9a6a28d18ca675ddb2402312e1ba6e031b --- /dev/null +++ b/parser/schema/base.py @@ -0,0 +1,34 @@ +"""Base schema for readers.""" +from dataclasses import dataclass + +from langchain.docstore.document import Document as LCDocument +from application.parser.schema.schema import BaseDocument + + +@dataclass +class Document(BaseDocument): + """Generic interface for a data document. + + This document connects to data sources. 
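+
+    Example (a minimal sketch; the text and metadata values are made up):
+
+        doc = Document(text="Hello world", extra_info={"title": "greeting.txt"})
+        lc_doc = doc.to_langchain_format()
+        doc_roundtrip = Document.from_langchain_format(lc_doc)
+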
+ + """ + + def __post_init__(self) -> None: + """Post init.""" + if self.text is None: + raise ValueError("text field not set.") + + @classmethod + def get_type(cls) -> str: + """Get Document type.""" + return "Document" + + def to_langchain_format(self) -> LCDocument: + """Convert struct to LangChain document format.""" + metadata = self.extra_info or {} + return LCDocument(page_content=self.text, metadata=metadata) + + @classmethod + def from_langchain_format(cls, doc: LCDocument) -> "Document": + """Convert struct from LangChain document format.""" + return cls(text=doc.page_content, extra_info=doc.metadata) diff --git a/parser/schema/schema.py b/parser/schema/schema.py new file mode 100644 index 0000000000000000000000000000000000000000..ec467e5a74f81e11b21a7965c94c1986708445d6 --- /dev/null +++ b/parser/schema/schema.py @@ -0,0 +1,64 @@ +"""Base schema for data structures.""" +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from dataclasses_json import DataClassJsonMixin + + +@dataclass +class BaseDocument(DataClassJsonMixin): + """Base document. + + Generic abstract interfaces that captures both index structs + as well as documents. + + """ + + # TODO: consolidate fields from Document/IndexStruct into base class + text: Optional[str] = None + doc_id: Optional[str] = None + embedding: Optional[List[float]] = None + + # extra fields + extra_info: Optional[Dict[str, Any]] = None + + @classmethod + @abstractmethod + def get_type(cls) -> str: + """Get Document type.""" + + def get_text(self) -> str: + """Get text.""" + if self.text is None: + raise ValueError("text field not set.") + return self.text + + def get_doc_id(self) -> str: + """Get doc_id.""" + if self.doc_id is None: + raise ValueError("doc_id not set.") + return self.doc_id + + @property + def is_doc_id_none(self) -> bool: + """Check if doc_id is None.""" + return self.doc_id is None + + def get_embedding(self) -> List[float]: + """Get embedding. + + Errors if embedding is None. 
+ + """ + if self.embedding is None: + raise ValueError("embedding not set.") + return self.embedding + + @property + def extra_info_str(self) -> Optional[str]: + """Extra info string.""" + if self.extra_info is None: + return None + + return "\n".join([f"{k}: {str(v)}" for k, v in self.extra_info.items()]) diff --git a/parser/token_func.py b/parser/token_func.py new file mode 100644 index 0000000000000000000000000000000000000000..14b231fcda46b78b6d04c7757e6aa17ce5285b8c --- /dev/null +++ b/parser/token_func.py @@ -0,0 +1,77 @@ +import re +from math import ceil +from typing import List + +import tiktoken +from application.parser.schema.base import Document + + +def separate_header_and_body(text): + header_pattern = r"^(.*?\n){3}" + match = re.match(header_pattern, text) + header = match.group(0) + body = text[len(header):] + return header, body + + +def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]: + docs = [] + current_group = None + + for doc in documents: + doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text)) + + if current_group is None: + current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding, + extra_info=doc.extra_info) + elif len(tiktoken.get_encoding("cl100k_base").encode( + current_group.text)) + doc_len < max_tokens and doc_len < min_tokens: + current_group.text += " " + doc.text + else: + docs.append(current_group) + current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding, + extra_info=doc.extra_info) + + if current_group is not None: + docs.append(current_group) + + return docs + + +def split_documents(documents: List[Document], max_tokens: int) -> List[Document]: + docs = [] + for doc in documents: + token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text)) + if token_length <= max_tokens: + docs.append(doc) + else: + header, body = separate_header_and_body(doc.text) + if len(tiktoken.get_encoding("cl100k_base").encode(header)) > max_tokens: + body = doc.text + header = "" + num_body_parts = ceil(token_length / max_tokens) + part_length = ceil(len(body) / num_body_parts) + body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)] + for i, body_part in enumerate(body_parts): + new_doc = Document(text=header + body_part.strip(), + doc_id=f"{doc.doc_id}-{i}", + embedding=doc.embedding, + extra_info=doc.extra_info) + docs.append(new_doc) + return docs + + +def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True): + if not token_check: + return documents + print("Grouping small documents") + try: + documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens) + except Exception: + print("Grouping failed, try running without token_check") + print("Separating large documents") + try: + documents = split_documents(documents=documents, max_tokens=max_tokens) + except Exception: + print("Grouping failed, try running without token_check") + return documents diff --git a/prompts/chat_combine_creative.txt b/prompts/chat_combine_creative.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f9a61c9da8020d960e52b6fdea4643e5964cff7 --- /dev/null +++ b/prompts/chat_combine_creative.txt @@ -0,0 +1,9 @@ +You are a DocsGPT, friendly and helpful AI assistant by Arc53 that provides help with documents. You give thorough answers with code examples if possible. +Use the following pieces of context to help answer the users question. 
If it's not relevant to the question, provide friendly responses.
+You have access to chat history, and can use it to help answer the question.
+When using code examples, use the following format:
+```(language)
+(code)
+```
+----------------
+{summaries}
\ No newline at end of file
diff --git a/prompts/chat_combine_default.txt b/prompts/chat_combine_default.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fdf0b2c25d7d59c9f7c99c2498c8b3cad375bd36
--- /dev/null
+++ b/prompts/chat_combine_default.txt
@@ -0,0 +1,9 @@
+You are a helpful AI assistant, DocsGPT, specializing in document assistance, designed to offer detailed and informative responses.
+If appropriate, your answers can include code examples, formatted as follows:
+```(language)
+(code)
+```
+You effectively utilize chat history, ensuring relevant and tailored responses.
+If a question doesn't align with your context, you provide friendly and helpful replies.
+----------------
+{summaries}
\ No newline at end of file
diff --git a/prompts/chat_combine_strict.txt b/prompts/chat_combine_strict.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b6de46303318a1ca31e20624596f38b40754254d
--- /dev/null
+++ b/prompts/chat_combine_strict.txt
@@ -0,0 +1,13 @@
+You are an AI Assistant, DocsGPT, adept at offering document assistance.
+Your expertise lies in providing answers on top of the provided context.
+You can leverage the chat history if needed.
+Answer the question based on the context below.
+Keep the answer concise. Respond "Irrelevant context" if you are not sure about the answer.
+If the question is not related to the context, respond "Irrelevant context".
+When using code examples, use the following format:
+```(language)
+(code)
+```
+ ----------------
+ Context:
+ {summaries}
\ No newline at end of file
diff --git a/prompts/chat_reduce_prompt.txt b/prompts/chat_reduce_prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a5842d878d56ea2c9ba892e473aff74263e1781b
--- /dev/null
+++ b/prompts/chat_reduce_prompt.txt
@@ -0,0 +1,3 @@
+Use the following pieces of context to help answer the user's question.
If its not relevant to the question, respond with "-" +---------------- +{context} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1bcb63331101e2bac426297bbb2ad0c829b20b2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,145 @@ +aiodns==3.1.1 +aiohttp==3.9.1 +aiohttp-retry==2.8.3 +aiosignal==1.3.1 +amqp==5.2.0 +annotated-types==0.6.0 +anthropic==0.8.0 +anyio==4.2.0 +async-timeout==4.0.3 +attrs==23.1.0 +billiard==4.2.0 +blinker==1.7.0 +blobfile==2.1.1 +boto3==1.34.6 +botocore==1.34.6 +celery==5.3.6 +certifi==2023.11.17 +cffi==1.16.0 +chardet==5.2.0 +charset-normalizer==3.3.2 +click==8.1.7 +click-didyoumean==0.3.0 +click-plugins==1.1.1 +click-repl==0.3.0 +cryptography==41.0.7 +dataclasses-json==0.6.3 +decorator==5.1.1 +dill==0.3.7 +distro==1.8.0 +dnspython==2.4.2 +docx2txt==0.8 +ecdsa==0.18.0 +elastic-transport==8.11.0 +elasticsearch==8.11.1 +entrypoints==0.4 +faiss-cpu==1.7.4 +filelock==3.13.1 +Flask==3.0.0 +Flask-Cors==4.0.0 +frozenlist==1.4.1 +fsspec==2023.12.2 +geojson==2.5.0 +greenlet==3.0.3 +gunicorn==21.2.0 +h11==0.14.0 +httpcore==1.0.2 +httpx==0.26.0 +huggingface-hub==0.20.1 +humbug==0.3.2 +idna==3.6 +iniconfig==2.0.0 +itsdangerous==2.1.2 +Jinja2==3.1.2 +jmespath==1.0.1 +joblib==1.3.2 +jsonpatch==1.33 +jsonpointer==2.4 +kombu==5.3.4 +langchain==0.0.352 +langchain-community==0.0.6 +langchain-core==0.1.3 +langsmith==0.0.74 +lazy-object-proxy==1.10.0 +loguru==0.7.2 +lxml==4.9.4 +MarkupSafe==2.1.3 +marshmallow==3.20.1 +marshmallow-enum==1.5.1 +mpmath==1.3.0 +multidict==6.0.4 +multiprocess==0.70.15 +mypy-extensions==1.0.0 +networkx==3.2.1 +nltk==3.8.1 +npx==0.1.1 +numcodecs==0.12.1 +numpy==1.26.2 +openai==1.6.1 +openapi-schema-validator==0.6.2 +openapi-spec-validator==0.6.0 +openapi3-parser==1.1.16 +packaging==23.2 +pathable==0.4.3 +pathos==0.3.1 +Pillow==10.1.0 +pluggy==1.3.0 +pox==0.3.3 +ppft==1.7.6.7 +prance==23.6.21.0 +prompt-toolkit==3.0.43 +pyasn1==0.5.1 +pycares==4.4.0 +pycparser==2.21 +pycryptodome==3.19.0 +pycryptodomex==3.19.0 +pydantic==2.5.3 +pydantic_core==2.14.6 +pydantic_settings==2.1.0 +PyJWT==2.8.0 +pymongo==4.6.1 +pyowm==3.3.0 +PyPDF2==3.0.1 +PySocks==1.7.1 +pytest==7.4.3 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +python-jose==3.3.0 +python-liquid==1.10.2 +pytz==2023.3.post1 +PyYAML==6.0.1 +redis==5.0.1 +regex==2023.10.3 +requests==2.31.0 +retry==0.9.2 +rfc3339-validator==0.1.4 +rpds-py==0.15.2 +rsa==4.9 +ruamel.yaml==0.18.5 +ruamel.yaml.clib==0.2.8 +s3transfer==0.10.0 +safetensors==0.4.1 +scikit-learn==1.3.2 +scipy==1.11.4 +sentence-transformers +sentencepiece==0.1.99 +six==1.16.0 +sniffio==1.3.0 +SQLAlchemy==2.0.23 +sympy==1.12 +tenacity==8.2.3 +threadpoolctl==3.2.0 +tiktoken +tokenizers==0.15.0 +torch==2.1.2 +torchvision==0.16.2 +tqdm==4.66.1 +transformers==4.36.2 +typer==0.9.0 +typing-inspect==0.9.0 +typing_extensions==4.9.0 +tzdata==2023.3 +vine==5.1.0 +wcwidth==0.2.12 +Werkzeug==3.0.1 +yarl==1.9.4 diff --git a/vectorstore/__init__.py b/vectorstore/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vectorstore/__pycache__/__init__.cpython-310.pyc b/vectorstore/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dccc9409174f1a476def628165a550a276eb032d Binary files /dev/null and b/vectorstore/__pycache__/__init__.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/base.cpython-310.pyc 
b/vectorstore/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e3eb4d7ea29c66903749c3a4880a2f431b171d6 Binary files /dev/null and b/vectorstore/__pycache__/base.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/document_class.cpython-310.pyc b/vectorstore/__pycache__/document_class.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbff6ead5eea51b8eb90e3766fda5813c36645cf Binary files /dev/null and b/vectorstore/__pycache__/document_class.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/elasticsearch.cpython-310.pyc b/vectorstore/__pycache__/elasticsearch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..212c6c87e6c3e09f72f5d93caefac1b5a359db79 Binary files /dev/null and b/vectorstore/__pycache__/elasticsearch.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/faiss.cpython-310.pyc b/vectorstore/__pycache__/faiss.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..089f4719c028edbaeba0266ff82a64c1bd938dd8 Binary files /dev/null and b/vectorstore/__pycache__/faiss.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/mongodb.cpython-310.pyc b/vectorstore/__pycache__/mongodb.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d81e69065cafeb121a7117ba434ed4efab0f115b Binary files /dev/null and b/vectorstore/__pycache__/mongodb.cpython-310.pyc differ diff --git a/vectorstore/__pycache__/vector_creator.cpython-310.pyc b/vectorstore/__pycache__/vector_creator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30f15e71989fcd9fffc10ee848875dd4bad2e1bb Binary files /dev/null and b/vectorstore/__pycache__/vector_creator.cpython-310.pyc differ diff --git a/vectorstore/base.py b/vectorstore/base.py new file mode 100644 index 0000000000000000000000000000000000000000..ffff49b6a262417bac4b0ae58b121b07abe1cd65 --- /dev/null +++ b/vectorstore/base.py @@ -0,0 +1,56 @@ +from abc import ABC, abstractmethod +import os +from langchain.embeddings import ( + OpenAIEmbeddings, + HuggingFaceEmbeddings, + CohereEmbeddings, + HuggingFaceInstructEmbeddings, +) +from application.core.settings import settings + +class BaseVectorStore(ABC): + def __init__(self): + pass + + @abstractmethod + def search(self, *args, **kwargs): + pass + + def is_azure_configured(self): + return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME + + def _get_embeddings(self, embeddings_name, embeddings_key=None): + embeddings_factory = { + "openai_text-embedding-ada-002": OpenAIEmbeddings, + "huggingface_sentence-transformers/all-mpnet-base-v2": HuggingFaceEmbeddings, + "huggingface_hkunlp/instructor-large": HuggingFaceInstructEmbeddings, + "cohere_medium": CohereEmbeddings + } + + if embeddings_name not in embeddings_factory: + raise ValueError(f"Invalid embeddings_name: {embeddings_name}") + + if embeddings_name == "openai_text-embedding-ada-002": + if self.is_azure_configured(): + os.environ["OPENAI_API_TYPE"] = "azure" + embedding_instance = embeddings_factory[embeddings_name]( + model=settings.AZURE_EMBEDDINGS_DEPLOYMENT_NAME + ) + else: + embedding_instance = embeddings_factory[embeddings_name]( + openai_api_key=embeddings_key + ) + elif embeddings_name == "cohere_medium": + embedding_instance = embeddings_factory[embeddings_name]( + cohere_api_key=embeddings_key + ) + elif embeddings_name == 
"huggingface_sentence-transformers/all-mpnet-base-v2": + embedding_instance = embeddings_factory[embeddings_name]( + model_name="./model/all-mpnet-base-v2", + model_kwargs={"device": "cpu"}, + ) + else: + embedding_instance = embeddings_factory[embeddings_name]() + + return embedding_instance + diff --git a/vectorstore/document_class.py b/vectorstore/document_class.py new file mode 100644 index 0000000000000000000000000000000000000000..30d70a560e210ecf71c5bc84fba7f9d8733afe95 --- /dev/null +++ b/vectorstore/document_class.py @@ -0,0 +1,8 @@ +class Document(str): + """Class for storing a piece of text and associated metadata.""" + + def __new__(cls, page_content: str, metadata: dict): + instance = super().__new__(cls, page_content) + instance.page_content = page_content + instance.metadata = metadata + return instance diff --git a/vectorstore/elasticsearch.py b/vectorstore/elasticsearch.py new file mode 100644 index 0000000000000000000000000000000000000000..bb28d5cef8bae067a7c09c6a3aea0ca054dec7b7 --- /dev/null +++ b/vectorstore/elasticsearch.py @@ -0,0 +1,213 @@ +from application.vectorstore.base import BaseVectorStore +from application.core.settings import settings +from application.vectorstore.document_class import Document +import elasticsearch + + + + +class ElasticsearchStore(BaseVectorStore): + _es_connection = None # Class attribute to hold the Elasticsearch connection + + def __init__(self, path, embeddings_key, index_name=settings.ELASTIC_INDEX): + super().__init__() + self.path = path.replace("application/indexes/", "").rstrip("/") + self.embeddings_key = embeddings_key + self.index_name = index_name + + if ElasticsearchStore._es_connection is None: + connection_params = {} + if settings.ELASTIC_URL: + connection_params["hosts"] = [settings.ELASTIC_URL] + connection_params["http_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD) + elif settings.ELASTIC_CLOUD_ID: + connection_params["cloud_id"] = settings.ELASTIC_CLOUD_ID + connection_params["basic_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD) + else: + raise ValueError("Please provide either elasticsearch_url or cloud_id.") + + + + ElasticsearchStore._es_connection = elasticsearch.Elasticsearch(**connection_params) + + self.docsearch = ElasticsearchStore._es_connection + + def connect_to_elasticsearch( + *, + es_url = None, + cloud_id = None, + api_key = None, + username = None, + password = None, + ): + try: + import elasticsearch + except ImportError: + raise ImportError( + "Could not import elasticsearch python package. " + "Please install it with `pip install elasticsearch`." + ) + + if es_url and cloud_id: + raise ValueError( + "Both es_url and cloud_id are defined. Please provide only one." 
+ ) + + connection_params = {} + + if es_url: + connection_params["hosts"] = [es_url] + elif cloud_id: + connection_params["cloud_id"] = cloud_id + else: + raise ValueError("Please provide either elasticsearch_url or cloud_id.") + + if api_key: + connection_params["api_key"] = api_key + elif username and password: + connection_params["basic_auth"] = (username, password) + + es_client = elasticsearch.Elasticsearch( + **connection_params, + ) + try: + es_client.info() + except Exception as e: + raise e + + return es_client + + def search(self, question, k=2, index_name=settings.ELASTIC_INDEX, *args, **kwargs): + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key) + vector = embeddings.embed_query(question) + knn = { + "filter": [{"match": {"metadata.store.keyword": self.path}}], + "field": "vector", + "k": k, + "num_candidates": 100, + "query_vector": vector, + } + full_query = { + "knn": knn, + "query": { + "bool": { + "must": [ + { + "match": { + "text": { + "query": question, + } + } + } + ], + "filter": [{"match": {"metadata.store.keyword": self.path}}], + } + }, + "rank": {"rrf": {}}, + } + resp = self.docsearch.search(index=self.index_name, query=full_query['query'], size=k, knn=full_query['knn']) + # create Documents objects from the results page_content ['_source']['text'], metadata ['_source']['metadata'] + doc_list = [] + for hit in resp['hits']['hits']: + + doc_list.append(Document(page_content = hit['_source']['text'], metadata = hit['_source']['metadata'])) + return doc_list + + def _create_index_if_not_exists( + self, index_name, dims_length + ): + + if self._es_connection.indices.exists(index=index_name): + print(f"Index {index_name} already exists.") + + else: + + indexSettings = self.index( + dims_length=dims_length, + ) + self._es_connection.indices.create(index=index_name, **indexSettings) + + def index( + self, + dims_length, + ): + return { + "mappings": { + "properties": { + "vector": { + "type": "dense_vector", + "dims": dims_length, + "index": True, + "similarity": "cosine", + }, + } + } + } + + def add_texts( + self, + texts, + metadatas = None, + ids = None, + refresh_indices = True, + create_index_if_not_exists = True, + bulk_kwargs = None, + **kwargs, + ): + + from elasticsearch.helpers import BulkIndexError, bulk + + bulk_kwargs = bulk_kwargs or {} + import uuid + embeddings = [] + ids = ids or [str(uuid.uuid4()) for _ in texts] + requests = [] + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key) + + vectors = embeddings.embed_documents(list(texts)) + + dims_length = len(vectors[0]) + + if create_index_if_not_exists: + self._create_index_if_not_exists( + index_name=self.index_name, dims_length=dims_length + ) + + for i, (text, vector) in enumerate(zip(texts, vectors)): + metadata = metadatas[i] if metadatas else {} + + requests.append( + { + "_op_type": "index", + "_index": self.index_name, + "text": text, + "vector": vector, + "metadata": metadata, + "_id": ids[i], + } + ) + + + if len(requests) > 0: + try: + success, failed = bulk( + self._es_connection, + requests, + stats_only=True, + refresh=refresh_indices, + **bulk_kwargs, + ) + return ids + except BulkIndexError as e: + print(f"Error adding texts: {e}") + firstError = e.errors[0].get("index", {}).get("error", {}) + print(f"First error reason: {firstError.get('reason')}") + raise e + + else: + return [] + + def delete_index(self): + self._es_connection.delete_by_query(index=self.index_name, query={"match": { + "metadata.store.keyword": 
self.path}},) + diff --git a/vectorstore/faiss.py b/vectorstore/faiss.py new file mode 100644 index 0000000000000000000000000000000000000000..3a0a7b823bbc731dc7f3fa521811c641d335d578 --- /dev/null +++ b/vectorstore/faiss.py @@ -0,0 +1,46 @@ +from langchain.vectorstores import FAISS +from application.vectorstore.base import BaseVectorStore +from application.core.settings import settings + +class FaissStore(BaseVectorStore): + + def __init__(self, path, embeddings_key, docs_init=None): + super().__init__() + self.path = path + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) + if docs_init: + self.docsearch = FAISS.from_documents( + docs_init, embeddings + ) + else: + self.docsearch = FAISS.load_local( + self.path, embeddings + ) + self.assert_embedding_dimensions(embeddings) + + def search(self, *args, **kwargs): + return self.docsearch.similarity_search(*args, **kwargs) + + def add_texts(self, *args, **kwargs): + return self.docsearch.add_texts(*args, **kwargs) + + def save_local(self, *args, **kwargs): + return self.docsearch.save_local(*args, **kwargs) + + def delete_index(self, *args, **kwargs): + return self.docsearch.delete(*args, **kwargs) + + def assert_embedding_dimensions(self, embeddings): + """ + Check that the word embedding dimension of the docsearch index matches + the dimension of the word embeddings used + """ + if settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2": + try: + word_embedding_dimension = embeddings.client[1].word_embedding_dimension + except AttributeError as e: + raise AttributeError("word_embedding_dimension not found in embeddings.client[1]") from e + docsearch_index_dimension = self.docsearch.index.d + if word_embedding_dimension != docsearch_index_dimension: + raise ValueError(f"word_embedding_dimension ({word_embedding_dimension}) " + + f"!= docsearch_index_word_embedding_dimension ({docsearch_index_dimension})") \ No newline at end of file diff --git a/vectorstore/mongodb.py b/vectorstore/mongodb.py new file mode 100644 index 0000000000000000000000000000000000000000..337fc41fe3c79eb59409046095cb96fe177216a7 --- /dev/null +++ b/vectorstore/mongodb.py @@ -0,0 +1,126 @@ +from application.vectorstore.base import BaseVectorStore +from application.core.settings import settings +from application.vectorstore.document_class import Document + +class MongoDBVectorStore(BaseVectorStore): + def __init__( + self, + path: str = "", + embeddings_key: str = "embeddings", + collection: str = "documents", + index_name: str = "vector_search_index", + text_key: str = "text", + embedding_key: str = "embedding", + database: str = "docsgpt", + ): + self._index_name = index_name + self._text_key = text_key + self._embedding_key = embedding_key + self._embeddings_key = embeddings_key + self._mongo_uri = settings.MONGO_URI + self._path = path.replace("application/indexes/", "").rstrip("/") + self._embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) + + try: + import pymongo + except ImportError: + raise ImportError( + "Could not import pymongo python package. " + "Please install it with `pip install pymongo`." 
+ ) + + self._client = pymongo.MongoClient(self._mongo_uri) + self._database = self._client[database] + self._collection = self._database[collection] + + + def search(self, question, k=2, *args, **kwargs): + query_vector = self._embedding.embed_query(question) + + pipeline = [ + { + "$vectorSearch": { + "queryVector": query_vector, + "path": self._embedding_key, + "limit": k, + "numCandidates": k * 10, + "index": self._index_name, + "filter": { + "store": {"$eq": self._path} + } + } + } + ] + + cursor = self._collection.aggregate(pipeline) + + results = [] + for doc in cursor: + text = doc[self._text_key] + doc.pop("_id") + doc.pop(self._text_key) + doc.pop(self._embedding_key) + metadata = doc + results.append(Document(text, metadata)) + return results + + def _insert_texts(self, texts, metadatas): + if not texts: + return [] + embeddings = self._embedding.embed_documents(texts) + to_insert = [ + {self._text_key: t, self._embedding_key: embedding, **m} + for t, m, embedding in zip(texts, metadatas, embeddings) + ] + # insert the documents in MongoDB Atlas + insert_result = self._collection.insert_many(to_insert) + return insert_result.inserted_ids + + def add_texts(self, + texts, + metadatas = None, + ids = None, + refresh_indices = True, + create_index_if_not_exists = True, + bulk_kwargs = None, + **kwargs,): + + + #dims = self._embedding.client[1].word_embedding_dimension + # # check if index exists + # if create_index_if_not_exists: + # # check if index exists + # info = self._collection.index_information() + # if self._index_name not in info: + # index_mongo = { + # "fields": [{ + # "type": "vector", + # "path": self._embedding_key, + # "numDimensions": dims, + # "similarity": "cosine", + # }, + # { + # "type": "filter", + # "path": "store" + # }] + # } + # self._collection.create_index(self._index_name, index_mongo) + + batch_size = 100 + _metadatas = metadatas or ({} for _ in texts) + texts_batch = [] + metadatas_batch = [] + result_ids = [] + for i, (text, metadata) in enumerate(zip(texts, _metadatas)): + texts_batch.append(text) + metadatas_batch.append(metadata) + if (i + 1) % batch_size == 0: + result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) + texts_batch = [] + metadatas_batch = [] + if texts_batch: + result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) + return result_ids + + def delete_index(self, *args, **kwargs): + self._collection.delete_many({"store": self._path}) \ No newline at end of file diff --git a/vectorstore/vector_creator.py b/vectorstore/vector_creator.py new file mode 100644 index 0000000000000000000000000000000000000000..68ae281364e55adc5c9a394c608b4c028c4ce92b --- /dev/null +++ b/vectorstore/vector_creator.py @@ -0,0 +1,18 @@ +from application.vectorstore.faiss import FaissStore +from application.vectorstore.elasticsearch import ElasticsearchStore +from application.vectorstore.mongodb import MongoDBVectorStore + + +class VectorCreator: + vectorstores = { + 'faiss': FaissStore, + 'elasticsearch':ElasticsearchStore, + 'mongodb': MongoDBVectorStore, + } + + @classmethod + def create_vectorstore(cls, type, *args, **kwargs): + vectorstore_class = cls.vectorstores.get(type.lower()) + if not vectorstore_class: + raise ValueError(f"No vectorstore class found for type {type}") + return vectorstore_class(*args, **kwargs) \ No newline at end of file diff --git a/worker.py b/worker.py new file mode 100644 index 0000000000000000000000000000000000000000..ae8f240c452cf3ba18c83c2340ab022ca02e781f --- /dev/null +++ b/worker.py @@ -0,0 +1,123 
@@ +import os +import shutil +import string +import zipfile +from urllib.parse import urljoin + +import nltk +import requests + +from application.core.settings import settings +from application.parser.file.bulk import SimpleDirectoryReader +from application.parser.open_ai_func import call_openai_api +from application.parser.schema.base import Document +from application.parser.token_func import group_split + +try: + nltk.download('punkt', quiet=True) + nltk.download('averaged_perceptron_tagger', quiet=True) +except FileExistsError: + pass + + +# Define a function to extract metadata from a given filename. +def metadata_from_filename(title): + store = '/'.join(title.split('/')[1:3]) + return {'title': title, 'store': store} + + +# Define a function to generate a random string of a given length. +def generate_random_string(length): + return ''.join([string.ascii_letters[i % 52] for i in range(length)]) + +current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Define the main function for ingesting and processing documents. +def ingest_worker(self, directory, formats, name_job, filename, user): + """ + Ingest and process documents. + + Args: + self: Reference to the instance of the task. + directory (str): Specifies the directory for ingesting ('inputs' or 'temp'). + formats (list of str): List of file extensions to consider for ingestion (e.g., [".rst", ".md"]). + name_job (str): Name of the job for this ingestion task. + filename (str): Name of the file to be ingested. + user (str): Identifier for the user initiating the ingestion. + + Returns: + dict: Information about the completed ingestion task, including input parameters and a "limited" flag. + """ + # directory = 'inputs' or 'temp' + # formats = [".rst", ".md"] + input_files = None + recursive = True + limit = None + exclude = True + # name_job = 'job1' + # filename = 'install.rst' + # user = 'local' + sample = False + token_check = True + min_tokens = 150 + max_tokens = 1250 + full_path = directory + '/' + user + '/' + name_job + import sys + print(full_path, file=sys.stderr) + # check if API_URL env variable is set + file_data = {'name': name_job, 'file': filename, 'user': user} + response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data) + # check if file is in the response + print(response, file=sys.stderr) + file = response.content + + if not os.path.exists(full_path): + os.makedirs(full_path) + with open(full_path + '/' + filename, 'wb') as f: + f.write(file) + + # check if file is .zip and extract it + if filename.endswith('.zip'): + with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref: + zip_ref.extractall(full_path) + os.remove(full_path + '/' + filename) + + self.update_state(state='PROGRESS', meta={'current': 1}) + + raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive, + required_exts=formats, num_files_limit=limit, + exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data() + raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) + + docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + + call_openai_api(docs, full_path, self) + self.update_state(state='PROGRESS', meta={'current': 100}) + + if sample: + for i in range(min(5, len(raw_docs))): + print(raw_docs[i].text) + + # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl + # and send them to the server (provide user and name in form) + 
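# For FAISS, the freshly built index files are uploaded to the API; for other vector stores only the job metadata is posted. +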
file_data = {'name': name_job, 'user': user} + if settings.VECTOR_STORE == "faiss": + # use a context manager so the index file handles are closed once the upload finishes + with open(full_path + '/index.faiss', 'rb') as file_faiss, open(full_path + '/index.pkl', 'rb') as file_pkl: + files = {'file_faiss': file_faiss, 'file_pkl': file_pkl} + response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data) + response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)) + else: + response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data) + + + # delete local + shutil.rmtree(full_path) + + return { + 'directory': directory, + 'formats': formats, + 'name_job': name_job, + 'filename': filename, + 'user': user, + 'limited': False + } diff --git a/wsgi.py b/wsgi.py new file mode 100644 index 0000000000000000000000000000000000000000..5160e115ecbf009f5762714715b2cc1ce8c865d4 --- /dev/null +++ b/wsgi.py @@ -0,0 +1,4 @@ +from application.app import app + +if __name__ == "__main__": + app.run(debug=True, port=7091)
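
As a quick orientation for reviewers, here is a minimal sketch of how the VectorCreator factory and the search API added in this diff could be exercised. It is illustrative only: the index path and the query below are made-up placeholders, settings.VECTOR_STORE is assumed to hold one of the registered backend names, and the application wires these calls through its Flask routes rather than a standalone script like this.

from application.core.settings import settings
from application.vectorstore.vector_creator import VectorCreator

# Hypothetical values; real callers derive the path from the user's active
# documents and pass the configured embeddings key.
store = VectorCreator.create_vectorstore(
    settings.VECTOR_STORE,                 # "faiss", "elasticsearch", or "mongodb"
    "application/indexes/local/default",   # placeholder index path
    settings.EMBEDDINGS_KEY,
)

docs = store.search("How do I install the project?", k=2)
for doc in docs:
    # every backend returns objects exposing page_content and a metadata dict
    print(doc.metadata.get("title"), doc.page_content[:80])

Keeping the backends behind one factory means switching between FAISS, Elasticsearch and MongoDB Atlas is a settings change rather than a code change in the route handlers.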