Spaces:
Runtime error
Runtime error
HemanthSai7
commited on
Commit
•
7a89bde
0
Parent(s):
HF spaces
Browse files- .gitattributes +41 -0
- .github/workflows/checkfilesize.yaml +16 -0
- .github/workflows/githubtohfsync.yaml +20 -0
- .gitignore +137 -0
- Dockerfile +16 -0
- README.md +56 -0
- StudybotAPI/app.py +1 -0
- StudybotAPI/backend/__init__.py +40 -0
- StudybotAPI/backend/core/ExceptionHandlers.py +27 -0
- StudybotAPI/backend/core/Exceptions.py +48 -0
- StudybotAPI/backend/core/__init__.py +0 -0
- StudybotAPI/backend/core/configEnv.py +24 -0
- StudybotAPI/backend/ingestion/__init__.py +3 -0
- StudybotAPI/backend/ingestion/embeddings.py +55 -0
- StudybotAPI/backend/ingestion/prepare_data.py +79 -0
- StudybotAPI/backend/ingestion/preprocess_data.py +35 -0
- StudybotAPI/backend/retriever/__init__.py +2 -0
- StudybotAPI/backend/retriever/ops.py +25 -0
- StudybotAPI/backend/retriever/pipeline.py +72 -0
- StudybotAPI/backend/routes.py +65 -0
- StudybotAPI/backend/schemas/FrontendResponseSchema.py +12 -0
- StudybotAPI/backend/schemas/__init__.py +2 -0
- StudybotAPI/backend/schemas/chatschema.py +13 -0
- StudybotAPI/backend/utils/__init__.py +1 -0
- StudybotAPI/backend/utils/chain_loader.py +40 -0
- StudybotAPI/backend/utils/prompt.txt +12 -0
- StudybotAPI/backend/utils/prompts.py +62 -0
- StudybotAPI/config.yml +10 -0
- StudybotAPI/requirements.txt +15 -0
- frontend/components/__init__.py +2 -0
- frontend/components/authors.py +13 -0
- frontend/components/user_greetings.py +6 -0
- frontend/layouts/__init__.py +0 -0
- frontend/layouts/mainlayout.py +19 -0
- frontend/layouts/st_page_layouts.json +14 -0
- frontend/pages/2_🤖_bot.py +9 -0
- frontend/requirements.txt +1 -0
- frontend/test.py +26 -0
- frontend/🏡_Home.py +65 -0
- notebooks/Untitled6.ipynb +0 -0
- notebooks/embeddings.ipynb +0 -0
- notebooks/selfrag.ipynb +0 -0
- test.py +26 -0
.gitattributes
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.png
|
37 |
+
.png
|
38 |
+
frontend/images/architecture.png filter=lfs diff=lfs merge=lfs -text
|
39 |
+
StudybotAPI/assets/*.png filter=lfs diff=lfs merge=lfs -text
|
40 |
+
StudybotAPI/backend/data/History_1.pdf filter=lfs diff=lfs merge=lfs -text
|
41 |
+
|
.github/workflows/checkfilesize.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Check file size
|
2 |
+
on: # or directly `on: [push]` to run the action on every push on any branch
|
3 |
+
pull_request:
|
4 |
+
branches: [main]
|
5 |
+
|
6 |
+
# to run this workflow manually from the Actions tab
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
sync-to-hub:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
steps:
|
13 |
+
- name: Check large files
|
14 |
+
uses: ActionsDesk/lfs-warning@v2.0
|
15 |
+
with:
|
16 |
+
filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
|
.github/workflows/githubtohfsync.yaml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Sync to Hugging Face hub
|
2 |
+
on:
|
3 |
+
push:
|
4 |
+
branches: [main]
|
5 |
+
|
6 |
+
# to run this workflow manually from the Actions tab
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
sync-to-hub:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
steps:
|
13 |
+
- uses: actions/checkout@v3
|
14 |
+
with:
|
15 |
+
fetch-depth: 0
|
16 |
+
lfs: true
|
17 |
+
- name: Push to hub
|
18 |
+
env:
|
19 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
20 |
+
run: git push https://HemanthSai7:$HF_TOKEN@huggingface.co/spaces/HemanthSai7/StudybotAPI main
|
.gitignore
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
marvin.toml
|
2 |
+
.marvin-history
|
3 |
+
marvin.egg-info
|
4 |
+
|
5 |
+
# Byte-compiled / optimized / DLL files
|
6 |
+
__pycache__/
|
7 |
+
*.py[cod]
|
8 |
+
*$py.class
|
9 |
+
|
10 |
+
# C extensions
|
11 |
+
*.so
|
12 |
+
|
13 |
+
# Distribution / packaging
|
14 |
+
.Python
|
15 |
+
build/
|
16 |
+
develop-eggs/
|
17 |
+
# dist/
|
18 |
+
downloads/
|
19 |
+
eggs/
|
20 |
+
.eggs/
|
21 |
+
lib/
|
22 |
+
lib64/
|
23 |
+
parts/
|
24 |
+
sdist/
|
25 |
+
var/
|
26 |
+
wheels/
|
27 |
+
pip-wheel-metadata/
|
28 |
+
share/python-wheels/
|
29 |
+
*.egg-info/
|
30 |
+
.installed.cfg
|
31 |
+
*.egg
|
32 |
+
MANIFEST
|
33 |
+
|
34 |
+
# PyInstaller
|
35 |
+
# Usually these files are written by a python script from a template
|
36 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
37 |
+
*.manifest
|
38 |
+
*.spec
|
39 |
+
|
40 |
+
# Installer logs
|
41 |
+
pip-log.txt
|
42 |
+
pip-delete-this-directory.txt
|
43 |
+
|
44 |
+
# Unit test / coverage reports
|
45 |
+
htmlcov/
|
46 |
+
.tox/
|
47 |
+
.nox/
|
48 |
+
.coverage
|
49 |
+
.coverage.*
|
50 |
+
.cache
|
51 |
+
nosetests.xml
|
52 |
+
coverage.xml
|
53 |
+
*.cover
|
54 |
+
*.py,cover
|
55 |
+
.hypothesis/
|
56 |
+
.pytest_cache/
|
57 |
+
|
58 |
+
# Translations
|
59 |
+
*.mo
|
60 |
+
*.pot
|
61 |
+
|
62 |
+
# Django stuff:
|
63 |
+
*.log
|
64 |
+
local_settings.py
|
65 |
+
db.sqlite3
|
66 |
+
db.sqlite3-journal
|
67 |
+
|
68 |
+
# Flask stuff:
|
69 |
+
instance/
|
70 |
+
.webassets-cache
|
71 |
+
|
72 |
+
# vector store
|
73 |
+
studybot/vectorstore/
|
74 |
+
|
75 |
+
# Scrapy stuff:
|
76 |
+
.scrapy
|
77 |
+
|
78 |
+
# Sphinx documentation
|
79 |
+
docs/_build/
|
80 |
+
|
81 |
+
# PyBuilder
|
82 |
+
target/
|
83 |
+
|
84 |
+
# Jupyter Notebook
|
85 |
+
.ipynb_checkpoints
|
86 |
+
|
87 |
+
# IPython
|
88 |
+
profile_default/
|
89 |
+
ipython_config.py
|
90 |
+
|
91 |
+
# pyenv
|
92 |
+
.python-version
|
93 |
+
|
94 |
+
# pipenv
|
95 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
96 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
97 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
98 |
+
# install all needed dependencies.
|
99 |
+
#Pipfile.lock
|
100 |
+
|
101 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
102 |
+
__pypackages__/
|
103 |
+
|
104 |
+
# Celery stuff
|
105 |
+
celerybeat-schedule
|
106 |
+
celerybeat.pid
|
107 |
+
|
108 |
+
# SageMath parsed files
|
109 |
+
*.sage.py
|
110 |
+
|
111 |
+
# Environments
|
112 |
+
.env
|
113 |
+
.venv
|
114 |
+
env/
|
115 |
+
venv/
|
116 |
+
ENV/
|
117 |
+
env.bak/
|
118 |
+
venv.bak/
|
119 |
+
cpcli-env/
|
120 |
+
|
121 |
+
# Spyder project settings
|
122 |
+
.spyderproject
|
123 |
+
.spyproject
|
124 |
+
|
125 |
+
# Rope project settings
|
126 |
+
.ropeproject
|
127 |
+
|
128 |
+
# mkdocs documentation
|
129 |
+
/site
|
130 |
+
|
131 |
+
# mypy
|
132 |
+
.mypy_cache/
|
133 |
+
.dmypy.json
|
134 |
+
dmypy.json
|
135 |
+
|
136 |
+
# Pyre type checker
|
137 |
+
.pyre/
|
Dockerfile
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10.9
|
2 |
+
|
3 |
+
RUN useradd -m -u 1000 user
|
4 |
+
USER user
|
5 |
+
ENV HOME=/home/user \
|
6 |
+
PATH=/home/user/.local/bin:$PATH
|
7 |
+
|
8 |
+
COPY --chown=user ./StudybotAPI $HOME/StudybotAPI
|
9 |
+
|
10 |
+
WORKDIR $HOME/StudybotAPI
|
11 |
+
|
12 |
+
RUN mkdir $HOME/.cache
|
13 |
+
|
14 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
15 |
+
|
16 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: StudybotAPI
|
3 |
+
emoji: 😻
|
4 |
+
colorFrom: gray
|
5 |
+
colorTo: blue
|
6 |
+
sdk: docker
|
7 |
+
pinned: false
|
8 |
+
---
|
9 |
+
|
10 |
+
# StudyBot x Streamlit
|
11 |
+
|
12 |
+
✨ Streamlit is an open-source app framework for Machine Learning and Data Science teams. Create beautiful data apps in hours, not weeks. All in pure Python. ✨
|
13 |
+
|
14 |
+
## Installation
|
15 |
+
|
16 |
+
```bash
|
17 |
+
pip install -r requirements.txt
|
18 |
+
```
|
19 |
+
|
20 |
+
## Start development server
|
21 |
+
|
22 |
+
> If you're using the default template, **remember to set the OpenAI API key** in `main.py`.
|
23 |
+
|
24 |
+
Run the following command:
|
25 |
+
|
26 |
+
```bash
|
27 |
+
cd StudybotAPI
|
28 |
+
uvicorn app:app --reload
|
29 |
+
```
|
30 |
+
|
31 |
+
Now go to [http://localhost:8000](http://localhost:8000) (uvicorn's default port) and start chatting with your bot! The server will automatically reload when you change the code.
|
32 |
+
|
33 |
+
## Motive
|
34 |
+
When studying a theoretical subject that has a lot of concepts, dates, important events, etc., it is hard to remember them all, no matter how hard we try to memorize them. So, I thought of making a bot which can help us with quick revision of the subject. For example, if we are studying history and we forget what happened in the 1857 revolt, we can ask the bot something like **"What happened in 1857?"** and it will give us a brief answer. This will help us with quick revision of the subject.
|
35 |
+
|
36 |
+
## How to use
|
37 |
+
Input the prompt in the text box and press enter. The bot will give you the answer. If you want to ask another question, just enter the question and the bot will try to answer.
|
38 |
+
|
39 |
+
## Screenshots
|
40 |
+
![image](StudybotAPI/assets/ss1.png)
|
41 |
+
![image](StudybotAPI/assets/ss2.png)
|
42 |
+
![image](StudybotAPI/assets/ss3.png)
|
43 |
+
|
44 |
+
## How it works
|
45 |
+
![image](StudybotAPI/assets/flowchart.png)
|
46 |
+
|
47 |
+
## Tech Stack
|
48 |
+
![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)
|
49 |
+
![FastAPI](https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi)
|
50 |
+
![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=for-the-badge&logo=javascript&logoColor=%23F7DF1E)
|
51 |
+
![HTML5](https://img.shields.io/badge/html5-%23E34F26.svg?style=for-the-badge&logo=html5&logoColor=white)
|
52 |
+
![GitHub Actions](https://img.shields.io/badge/github%20actions-%232671E5.svg?style=for-the-badge&logo=githubactions&logoColor=white)
|
53 |
+
![Langchain](https://img.shields.io/badge/langchain-%23E34F26.svg?style=for-the-badge&logo=langchains&logoColor=white)
|
54 |
+
![Huggingface](https://img.shields.io/badge/huggingface-%23E34F26.svg?style=for-the-badge&logo=huggingface&logoColor=white)
|
55 |
+
![Streamlit](https://img.shields.io/badge/streamlit-%23E34F26.svg?style=for-the-badge&logo=streamlit&logoColor=white)
|
56 |
+
![Docker](https://img.shields.io/badge/docker-%23E34F26.svg?style=for-the-badge&logo=docker&logoColor=white)
|
StudybotAPI/app.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from backend import app
|
StudybotAPI/backend/__init__.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import box
|
3 |
+
import yaml
|
4 |
+
|
5 |
+
from fastapi import FastAPI
|
6 |
+
|
7 |
+
from backend.ingestion import *
|
8 |
+
|
9 |
+
# from langchain.llms.huggingface_pipeline import HuggingFacePipeline
|
10 |
+
|
11 |
+
|
12 |
+
app = FastAPI(title="StudyBot API", version="0.1.0", description="API for StudyBot Project")
|
13 |
+
|
14 |
+
from backend import routes
|
15 |
+
# from backend.retriever import EmbeddingModel
|
16 |
+
|
17 |
+
|
18 |
+
try:
    # Startup configuration: set the model cache location, read config.yml and
    # attach an Embeddings helper to the application state.
    # NOTE(review): the Dockerfile creates $HOME/.cache, not /.cache — confirm
    # this is the intended (and writable) directory.
    os.environ["TRANSFORMERS_CACHE"] = "/.cache"

    # config.yml is resolved relative to the current working directory.
    with open("config.yml", "r", encoding="utf8") as ymlfile:
        cfg = box.Box(yaml.safe_load(ymlfile))
        app.state.emb = Embeddings(cfg)

    # Alternative local LLM back-ends, currently disabled:
    # llm = HuggingFacePipeline(pipeline=EmbeddingModel._initialize_pipeline())
    # llm = LlamaCpp(
    #     streaming=True,
    #     model_path="models/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    #     max_tokens=1500,
    #     temperature=0.4,
    #     top_p=1,
    #     gpu_layers=0,
    #     stream=True,
    #     verbose=False,
    #     n_threads=int(os.cpu_count() / 2),
    #     n_ctx=4096
    # )

except Exception as e:
    # Any failure above is only printed, so app.state.emb may be left unset
    # while the application still starts.
    print(e)
|
StudybotAPI/backend/core/ExceptionHandlers.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from backend import app
|
2 |
+
from .Exceptions import *
|
3 |
+
|
4 |
+
from fastapi.responses import JSONResponse
|
5 |
+
from fastapi.requests import Request
|
6 |
+
from fastapi import status
|
7 |
+
|
8 |
+
@app.exception_handler(ModelDeploymentException)
async def model_deploying_exception_handler(request: Request, exc: ModelDeploymentException):
    """Answer a ModelDeploymentException with HTTP 503.

    The response body is the exception's ``repr`` string.
    NOTE(review): ``exc.response_result`` is populated by the exception but is
    not what gets sent here — confirm that is intended.
    """
    body = repr(exc)
    return JSONResponse(content=body, status_code=status.HTTP_503_SERVICE_UNAVAILABLE)
|
14 |
+
|
15 |
+
@app.exception_handler(InfoNotProvidedException)
async def info_not_provided_exception_handler(request: Request, exc: InfoNotProvidedException):
    """Answer an InfoNotProvidedException with HTTP 400.

    The response body is the exception's ``repr`` string.
    """
    body = repr(exc)
    return JSONResponse(content=body, status_code=status.HTTP_400_BAD_REQUEST)
|
21 |
+
|
22 |
+
@app.exception_handler(DataNotUploadedException)
async def data_not_uploaded_exception_handler(request: Request, exc: DataNotUploadedException):
    """Answer a DataNotUploadedException with HTTP 400.

    The response body is the exception's ``repr`` string.
    """
    body = repr(exc)
    return JSONResponse(content=body, status_code=status.HTTP_400_BAD_REQUEST)
|
StudybotAPI/backend/core/Exceptions.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from backend.schemas import FrontendResponseModel
|
2 |
+
|
3 |
+
|
4 |
+
class ModelDeploymentException(Exception):
    """Error raised when the model could not produce a response.

    Constructing the exception rewrites the supplied response payload in
    place: its status becomes "Error" and its first message is replaced.
    """

    def __init__(self, response_result: FrontendResponseModel):
        super().__init__()
        self.response_result = response_result
        self.set_status()

    def set_status(self):
        """Mark the attached payload as failed with a retry-later message."""
        payload = self.response_result
        payload["status"] = "Error"
        # Overwrites the first entry; later entries are left as they are.
        payload["message"][0] = "Model is deploying. Please try again later."

    def __repr__(self):
        return "exception.ModelDeployingException()"
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
class InfoNotProvidedException(Exception):
    """Error raised when a request lacks required information.

    Constructing the exception rewrites the supplied response payload in
    place: status becomes "Error", the first message is replaced with a
    generic notice and the caller's detail message is appended after it.
    """

    def __init__(self, response_result: FrontendResponseModel, message: str):
        super().__init__(message)
        self.response_result = response_result
        self.message = message
        self.set_status()

    def set_status(self):
        """Mark the attached payload as failed and record both messages."""
        payload = self.response_result
        payload["status"] = "Error"
        notes = payload["message"]
        notes[0] = "Information not provided."
        notes.append(self.message)

    def __repr__(self):
        return "exception.InfoNotProvidedException()"
|
33 |
+
|
34 |
+
|
35 |
+
class DataNotUploadedException(Exception):
    """Error raised when an uploaded file could not be stored.

    Constructing the exception updates the supplied response payload in
    place: status becomes "Error" and an explanatory message is appended.

    Args:
        response_result: mutable response payload with "status" and
            "message" entries; it is kept on ``self.response_result``.
    """

    def __init__(self, response_result: "FrontendResponseModel"):
        # The original passed ModelDeploymentException to super(), which raises
        # TypeError because this class does not derive from it.
        super().__init__()
        self.response_result = response_result
        self.set_status()

    def set_status(self):
        """Mark the attached payload as failed and append the upload hint."""
        self.response_result["status"] = "Error"
        self.response_result["message"].append(
            "Data not uploaded. Please upload a file."
        )

    def __repr__(self):
        return "exception.DataNotUploadedException()"
|
StudybotAPI/backend/core/__init__.py
ADDED
File without changes
|
StudybotAPI/backend/core/configEnv.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Config class for handling env variables.
|
2 |
+
"""
|
3 |
+
from functools import lru_cache
|
4 |
+
from pydantic import BaseSettings
|
5 |
+
|
6 |
+
|
7 |
+
class Settings(BaseSettings):
    # Environment-backed settings; every field is a required string with no default.
    # Vector-store credential (by name; not referenced elsewhere in the visible code).
    QDRANT_API_KEY: str
    # Identifiers and token presumably used for the Clarifai-hosted model —
    # TODO confirm against the chain loader, which imports langchain's Clarifai LLM.
    APP_ID: str
    USER_ID: str
    MODEL_ID: str
    CLARIFAI_PAT: str
    MODEL_VERSION_ID: str

    class Config:
        # Values are also read from a local ".env" file.
        env_file = ".env"
|
17 |
+
|
18 |
+
|
19 |
+
@lru_cache()
def get_settings():
    """Build the Settings object once and reuse the cached instance afterwards."""
    settings = Settings()
    return settings
|
22 |
+
|
23 |
+
|
24 |
+
config = get_settings()
|
StudybotAPI/backend/ingestion/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from .embeddings import Embeddings
|
2 |
+
from .prepare_data import *
|
3 |
+
from .preprocess_data import *
|
StudybotAPI/backend/ingestion/embeddings.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.vectorstores import Qdrant
|
2 |
+
from langchain.embeddings import HuggingFaceBgeEmbeddings
|
3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
+
|
5 |
+
import shutil
|
6 |
+
import warnings
|
7 |
+
from backend.ingestion import *
|
8 |
+
|
9 |
+
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
10 |
+
|
11 |
+
|
12 |
+
class Embeddings:
    """Chunk documents and index them in an in-memory Qdrant collection."""

    def __init__(self, cfg):
        # cfg must expose EMBEDDINGS, DEVICE, NORMALIZE_EMBEDDINGS and VECTOR_DB.
        self.cfg = cfg

    def split_docs(self, documents, chunk_size=1000, chunk_overlap=150):
        """Split ``documents`` into overlapping chunks and return the chunks."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        return splitter.split_documents(documents)

    def store_embeddings(self, docs):
        """Embed ``docs`` and return a Qdrant vector store holding them.

        The store lives in memory (``location=":memory:"``); the configured
        VECTOR_DB value is used as the collection name. Any directory with
        that name on disk is removed first.
        """
        settings = self.cfg
        encoder = HuggingFaceBgeEmbeddings(
            model_name=settings.EMBEDDINGS,
            model_kwargs={"device": settings.DEVICE},
            encode_kwargs={"normalize_embeddings": settings.NORMALIZE_EMBEDDINGS},
        )

        # Remove any on-disk directory named after the collection.
        shutil.rmtree(settings.VECTOR_DB, ignore_errors=True)

        chunks = self.split_docs(docs)
        store = Qdrant.from_documents(
            chunks,
            encoder,
            location=":memory:",
            collection_name=settings.VECTOR_DB,
        )

        print(f"Vector store created at {settings.VECTOR_DB}")
        return store
|
48 |
+
|
49 |
+
|
50 |
+
# with open("config.yml", "r", encoding="utf8") as ymlfile:
|
51 |
+
# cfg = box.Box(yaml.safe_load(ymlfile))
|
52 |
+
# emb = Embeddings(cfg)
|
53 |
+
|
54 |
+
# docs=PDFDataLoader(cfg.DATA_PATH).load()
|
55 |
+
# emb.store_embeddings(docs)
|
StudybotAPI/backend/ingestion/prepare_data.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import Iterator
|
3 |
+
|
4 |
+
from langchain.schema import Document
|
5 |
+
from langchain.document_loaders.base import BaseLoader
|
6 |
+
from langchain.document_loaders import PyPDFLoader, TextLoader, DirectoryLoader
|
7 |
+
|
8 |
+
from .preprocess_data import make_texts_tokenisation_safe
|
9 |
+
|
10 |
+
|
11 |
+
class DataLoader(BaseLoader):
    """Base loader that remembers a data location and per-load metadata."""

    def __init__(self, data_dir: str):
        self.metadata = {}
        self.data_dir = data_dir

    def lazy_load(self) -> Iterator[Document]:
        """Produce the documents; subclasses must override this."""
        message = f"{self.__class__.__name__} does not implement lazy_load()"
        raise NotImplementedError(message)

    def load(self):
        """Return all documents as a list and record how many were loaded."""
        loaded = list(self.lazy_load())
        self.metadata["num_documents"] = len(loaded)
        return loaded
|
25 |
+
|
26 |
+
|
27 |
+
class PDFDataLoader(DataLoader):
    """Load the pages of a single PDF file as documents."""

    def __init__(self, data_dir: str):
        super().__init__(data_dir)
        self.metadata = {
            "data_dir": data_dir,
            "loader": "PDFDataLoader",
            "num_documents": None,
        }

    @make_texts_tokenisation_safe
    def lazy_load(self) -> Iterator[Document]:
        """Return the documents parsed from the PDF at ``self.data_dir``.

        Each document's metadata gains a "file_type" entry holding the
        extension of its source path. Loading is best-effort: on any error
        the problem is printed and an empty list is returned, so the
        decorator and ``load()`` can still iterate the result.
        """
        try:
            # self.data_dir is passed straight to PyPDFLoader as the file path.
            document = PyPDFLoader(self.data_dir).load()
            for doc in document:
                doc.metadata["file_type"] = os.path.splitext(doc.metadata["source"])[-1]
            return document

        except Exception as e:
            print(f"Error loading : {e}")
            # Returning None here made callers fail with an unrelated
            # "'NoneType' object is not iterable" TypeError.
            return []
|
50 |
+
|
51 |
+
|
52 |
+
class TextDataLoader(DataLoader):
    """Load every ``*.txt`` file found in a directory as documents."""

    def __init__(self, data_dir: str):
        super().__init__(data_dir)
        self.metadata = {
            "data_dir": data_dir,
            "loader": "TextDataLoader",
            "num_documents": None,
        }

    @make_texts_tokenisation_safe
    def lazy_load(self) -> Iterator[Document]:
        """Return the documents read from the text files in ``self.data_dir``.

        Each document's metadata gains a "file_type" entry holding the
        extension of its source path. Loading is best-effort: on any error
        the problem is printed and an empty list is returned, so the
        decorator and ``load()`` can still iterate the result.
        """
        try:
            document = DirectoryLoader(
                self.data_dir, glob="*.txt", loader_cls=TextLoader
            ).load()
            for doc in document:
                doc.metadata["file_type"] = os.path.splitext(doc.metadata["source"])[-1]
            return document

        except Exception as e:
            print(f"Error loading : {e}")
            # Returning None here made callers fail with an unrelated
            # "'NoneType' object is not iterable" TypeError.
            return []
|
74 |
+
|
75 |
+
|
76 |
+
# pdf_loader = PDFDataLoader(data_dir="E:/Projects/Hackathons/StudyBot/data")
|
77 |
+
|
78 |
+
# documents = pdf_loader.load()
|
79 |
+
# print(documents)
|
StudybotAPI/backend/ingestion/preprocess_data.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tiktoken
|
2 |
+
|
3 |
+
def make_texts_tokenisation_safe(func):
    """Decorate a document-producing function so its texts are tokenizer-safe.

    The wrapped function must return an iterable of objects with a
    ``page_content`` string. Every special token of the "cl100k_base"
    encoding is removed from each ``page_content`` in place.

    Args:
        func: callable returning an iterable of documents.

    Returns:
        A wrapper with the same signature. It returns the documents as a
        list: the original list object when ``func`` returned one, otherwise
        a list built from the returned iterable.
    """
    import functools  # local import: this module's header only imports tiktoken

    encoding = tiktoken.get_encoding("cl100k_base")
    special_tokens_set = encoding.special_tokens_set

    def remove_special_tokens(text):
        for token in special_tokens_set:
            text = text.replace(token, "")
        return text

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        documents = func(*args, **kwargs)
        if not isinstance(documents, list):
            # A one-shot iterator would be exhausted by the loop below and
            # reach the caller empty, so materialise it first.
            documents = list(documents)
        for document in documents:
            document.page_content = remove_special_tokens(document.page_content)
        return documents

    return wrapper
|
19 |
+
|
20 |
+
|
21 |
+
# def remove_whitespace(func):
|
22 |
+
# def wrapper(*args, **kwargs):
|
23 |
+
# result = func(*args, **kwargs)
|
24 |
+
# if isinstance(result, str):
|
25 |
+
# return result.replace(" ", "").replace("\t", "").replace("\n", "")
|
26 |
+
# elif isinstance(result, bytes):
|
27 |
+
# return (
|
28 |
+
# result.decode("utf-8")
|
29 |
+
# .replace(" ", "")
|
30 |
+
# .replace("\t", "")
|
31 |
+
# .replace("\n", "")
|
32 |
+
# )
|
33 |
+
# return result
|
34 |
+
|
35 |
+
# return wrapper
|
StudybotAPI/backend/retriever/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .ops import *
|
2 |
+
from .pipeline import *
|
StudybotAPI/backend/retriever/ops.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from backend.schemas import *
|
2 |
+
|
3 |
+
from backend.core.Exceptions import *
|
4 |
+
from backend.core.ExceptionHandlers import *
|
5 |
+
from backend import app
|
6 |
+
|
7 |
+
from clarifai_grpc.grpc.api.status import status_code_pb2
|
8 |
+
|
9 |
+
def ops_inference(response_result: FrontendResponseModel, question: str):
    """Answer ``question`` with the app's QA chain and store it in the payload.

    On success ``response_result["result"]`` is set to a dict with the
    stripped answer and its source documents. Nothing is returned; the
    payload is mutated in place.

    Args:
        response_result: mutable response payload with "status", "message"
            and "result" entries.
        question: the user's prompt.

    Raises:
        InfoNotProvidedException: if ``question`` is the empty string.
        ModelDeploymentException: if the chain call or result handling fails;
            the original error is attached as ``__cause__``.
    """
    if question == "":
        raise InfoNotProvidedException(response_result, "Come on, I'm not telepathic. I can't read your mind. Please provide me with a question.")

    try:
        # app.state.qa_chain is presumably set by the upload flow — TODO confirm;
        # if it is missing, the AttributeError is handled below like any other failure.
        llm_response = app.state.qa_chain(question)
        output = Inference(
            answer=llm_response["result"].strip(),
            source_documents=llm_response["source_documents"],
        )

        response_result["result"] = output.dict()
    except Exception as e:
        response_result["status"] = "error"
        response_result["message"].append(str(e))
        # Chain the cause so the underlying failure stays in the traceback.
        raise ModelDeploymentException(response_result) from e
|
StudybotAPI/backend/retriever/pipeline.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
from transformers import (
|
4 |
+
BitsAndBytesConfig,
|
5 |
+
AutoModelForCausalLM,
|
6 |
+
AutoTokenizer,
|
7 |
+
GenerationConfig,
|
8 |
+
pipeline,
|
9 |
+
)
|
10 |
+
|
11 |
+
import warnings
|
12 |
+
|
13 |
+
warnings.filterwarnings("ignore")
|
14 |
+
|
15 |
+
|
16 |
+
class EmbeddingModel:
    """Bundle a causal language model, its tokenizer and a generation config."""

    def __init__(self, model_name, generation_config):
        self.model_name = model_name
        self.generation_config = generation_config
        # The tokenizer is created before the model, as in the original order.
        self.tokenizer = self._initialize_tokenizer()
        self.model = self._initialize_model()

    def _initialize_tokenizer(self):
        """Load the fast tokenizer and use its EOS token for padding."""
        tok = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
        tok.pad_token = tok.eos_token
        return tok

    def _initialize_model(self):
        """Load the model with 4-bit NF4 quantization settings."""
        four_bit = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        return AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            device_map="auto",
            quantization_config=four_bit,
        )

    def _initialize_generation_config(self):
        """Load the model's generation config and apply fixed overrides."""
        gen_cfg = GenerationConfig.from_pretrained(self.model_name)
        overrides = {
            "max_new_tokens": 1024,
            "temperature": 0.0001,
            "top_p": 0.95,
            "do_sample": True,
            "repetition_penalty": 1.15,
        }
        for option, value in overrides.items():
            setattr(gen_cfg, option, value)
        return gen_cfg

    def _initialize_pipeline(self):
        """Build a text-generation pipeline from the stored components."""
        return pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            return_full_text=True,
            generation_config=self.generation_config,
        )
|
63 |
+
|
64 |
+
|
65 |
+
# if __name__ == "__main__":
|
66 |
+
# MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
|
67 |
+
|
68 |
+
# text_generator = MyTextGenerator(
|
69 |
+
# MODEL_NAME, MyTextGenerator._initialize_generation_config()
|
70 |
+
# )
|
71 |
+
# generated_text = text_generator.generate_text()
|
72 |
+
# print(generated_text)
|
StudybotAPI/backend/routes.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
from pathlib import Path
|
4 |
+
from typing import Callable
|
5 |
+
from tempfile import NamedTemporaryFile
|
6 |
+
|
7 |
+
from fastapi import Request, BackgroundTasks
|
8 |
+
from fastapi import UploadFile
|
9 |
+
from fastapi.middleware.cors import CORSMiddleware
|
10 |
+
|
11 |
+
from backend import app
|
12 |
+
from backend.schemas import *
|
13 |
+
from backend.retriever import *
|
14 |
+
from backend.utils import *
|
15 |
+
|
16 |
+
|
17 |
+
# Register CORS handling that accepts any origin, method and header.
# NOTE(review): a wildcard origin combined with allow_credentials=True is very
# permissive — confirm this is intended outside of development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
24 |
+
|
25 |
+
|
26 |
+
@app.get("/", tags=["Home"])
def api_home():
    """Landing route: return a static welcome payload."""
    greeting = {"detail": "Welcome to Studybot API"}
    return greeting
|
29 |
+
|
30 |
+
|
31 |
+
@app.post("/api/upload", response_model=DataResponseModel,summary="Upload", tags=["Resource Server"])
async def upload_data(file: UploadFile = File(...)):
    """Store an uploaded file temporarily and build the QA chain from it.

    The upload is copied to a temporary file that keeps the original
    extension, handed to ``llm_chain_loader`` and then deleted.

    Args:
        file: the uploaded document.

    Returns:
        A dict with "status" and "message" entries describing the outcome.

    Raises:
        DataNotUploadedException: if the upload cannot be written to disk.
    """
    response_result = {
        "status": "success",
        "message": ["Data uploaded successfully."]
    }

    tmp_path = None

    def _discard_tmp():
        # Best-effort removal of the temporary copy.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass

    try:
        suffix = Path(file.filename).suffix
        with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            # tmp.name is already absolute, so joining it onto the working
            # directory (as the original did) was a no-op.
            tmp_path = tmp.name
            shutil.copyfileobj(file.file, tmp)
    except Exception as e:
        _discard_tmp()
        response_result["status"] = "error"
        response_result["message"][0]=str(e)
        raise DataNotUploadedException(response_result) from e

    finally:
        file.file.close()

    try:
        await llm_chain_loader(DATA_PATH=tmp_path)
    finally:
        # delete=False leaves the file behind, so remove it once the loader has
        # run. NOTE(review): assumes the loader does not re-read the file
        # later — its body is only partly visible here.
        _discard_tmp()
    return response_result
|
54 |
+
|
55 |
+
|
56 |
+
@app.post("/api/inference",summary="Inference",response_model=FrontendResponseModel,tags=["Resource Server"])
def inference(data: Chat):
    """Answer the prompt in ``data`` and return the filled response payload."""
    # ops_inference mutates this payload in place (or raises).
    response_result = dict(status="success", message=[""], result={})
    ops_inference(response_result, data.promptMessage)
    return response_result
|
StudybotAPI/backend/schemas/FrontendResponseSchema.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
from pydantic import BaseModel
|
3 |
+
|
4 |
+
|
5 |
+
class FrontendResponseModel(BaseModel):
    """Response envelope used as ``response_model`` for the inference endpoint."""

    # Outcome marker; the routes initialise it to "success".
    status: str
    # List of message strings accompanying the status.
    message: List[str]
    # Free-form payload of the response.
    result: dict
|
9 |
+
|
10 |
+
class DataResponseModel(BaseModel):
    """Response envelope used as ``response_model`` for the upload endpoint."""

    # Outcome marker; the upload route sets "success" or "error".
    status: str
    # List of message strings accompanying the status.
    message: List[str]
|
StudybotAPI/backend/schemas/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .chatschema import *
|
2 |
+
from .FrontendResponseSchema import *
|
StudybotAPI/backend/schemas/chatschema.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import BaseModel
|
2 |
+
from fastapi import File, UploadFile
|
3 |
+
|
4 |
+
|
5 |
+
class Inference(BaseModel):
    """Schema pairing an answer string with a list of source documents."""

    # The answer text.
    answer: str
    # Source documents for the answer; element type is not constrained here.
    source_documents: list
|
8 |
+
|
9 |
+
class Chat(BaseModel):
    """Request body carrying a single chat prompt."""

    # The prompt text sent by the client.
    promptMessage: str
|
11 |
+
|
12 |
+
class Upload(BaseModel):
    """Schema wrapping a single uploaded file."""

    # The uploaded file object.
    file : UploadFile
|
StudybotAPI/backend/utils/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .chain_loader import *
|
StudybotAPI/backend/utils/chain_loader.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from backend.ingestion import *
|
2 |
+
from backend import app
|
3 |
+
from backend.core.configEnv import config
|
4 |
+
|
5 |
+
from langchain.chains import (
|
6 |
+
LLMChain,
|
7 |
+
SimpleSequentialChain,
|
8 |
+
RetrievalQA,
|
9 |
+
ConversationalRetrievalChain,
|
10 |
+
)
|
11 |
+
from langchain.llms import Clarifai
|
12 |
+
from langchain.prompts import PromptTemplate
|
13 |
+
|
14 |
+
|
15 |
+
async def llm_chain_loader(DATA_PATH: str):
    """Build a retrieval QA chain for the document at ``DATA_PATH``.

    The document is loaded with ``PDFDataLoader``, stored through
    ``app.state.emb.store_embeddings`` and wrapped in a ``RetrievalQA`` chain
    backed by a Clarifai-hosted model. The finished chain is published on
    ``app.state.qa_chain``; nothing is returned.

    Args:
        DATA_PATH: Path of the document to ingest.
    """
    from pathlib import Path  # local import keeps the block self-contained

    docs = PDFDataLoader(DATA_PATH).load()
    db = app.state.emb.store_embeddings(docs)

    # Resolve the template next to this module so loading does not depend on
    # the process's current working directory.
    prompt_path = Path(__file__).resolve().parent / "prompt.txt"
    template_text = prompt_path.read_text(encoding="utf8")

    prompt = PromptTemplate(
        template=template_text, input_variables=["context", "question"]
    )

    llm = Clarifai(
        pat=config.CLARIFAI_PAT,
        user_id=config.USER_ID,
        app_id=config.APP_ID,
        model_id=config.MODEL_ID,
        model_version_id=config.MODEL_VERSION_ID,
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        # Retrieve the two most similar chunks for each question.
        retriever=db.as_retriever(search_type="similarity",search_kwargs={"k": 2}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )

    app.state.qa_chain = qa_chain
|
StudybotAPI/backend/utils/prompt.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[INST] <<SYS>>
|
2 |
+
You are a revolutionary educational AI bot to assist students in quick revisions of theoretical subjects, which often involve numerous concepts, dates, and important events. You need to answer such that it aids in recalling key information for efficient study sessions. If you do not know the answer reply with 'I am sorry, I don't have enough information.'
|
3 |
+
ALWAYS return a "SOURCES" part in your answer.
|
4 |
+
The "SOURCES" part should be a reference to the source of the document from which you got your answer.
|
5 |
+
<</SYS>>
|
6 |
+
|
7 |
+
{context}
|
8 |
+
|
9 |
+
Consider a student engaged in the study of any theoretical subject, where the abundance of concepts and events poses a challenge to memorization. The aim is to overcome this hurdle and be capable of providing brief answers to specific queries. For example, if a student forgets a key concept, date, or event, they can ask the bot a question like "What is [specific query]?" for a concise answer.
|
10 |
+
Note that students can also ask multiple questions in a single query. For example, "What is [specific query 1]?, What is [specific query 2]?, What is [specific query 3]?".
|
11 |
+
|
12 |
+
{question} [/INST]
|
StudybotAPI/backend/utils/prompts.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Prompt templates for the subject-specific "teacher" personas. Every template
# exposes the {context} and {question} placeholders and wraps its system
# instructions in <<SYS>> ... <</SYS>> delimiters inside an [INST] block.

social_sciences_teacher = """
[INST] <<SYS>>
You are a revolutionary educational AI bot to assist students in quick revisions of theoretical subjects, which often involve numerous concepts, dates, and important events. You need to answer such that it aids in recalling key information for efficient study sessions. If you do not know the answer reply with 'I am sorry, I don't have enough information.'
ALWAYS return a "SOURCES" part in your answer.
The "SOURCES" part should be a reference to the source of the document from which you got your answer.
<</SYS>>

{context}

Consider a student engaged in the study of any theoretical subject, where the abundance of concepts and events poses a challenge to memorization. The aim is to overcome this hurdle and be capable of providing brief answers to specific queries. For example, if a student forgets a key concept, date, or event, they can ask the bot a question like "What is [specific query]?" for a concise answer.
Note that students can also ask multiple questions in a single query. For example, "What is [specific query 1]?, What is [specific query 2]?, What is [specific query 3]?".

{question} [/INST]
"""


english_teacher = """
[INST] <<SYS>>
You are an innovative language learning AI designed to assist students in improving their English language skills. Your goal is to provide helpful and engaging responses that aid in language comprehension, grammar, and vocabulary building. If you are unable to answer a question, respond with 'I am sorry, I don't have enough information.'
ALWAYS include a "RESOURCES" section in your answer, providing guidance on where students can find additional information or practice materials.
<</SYS>>

{context}

Imagine a student immersed in the study of the English language, aiming to enhance their understanding of grammar, vocabulary, and overall language proficiency. Your role is to assist them by providing concise and informative answers to specific language-related queries. For instance, if a student is uncertain about a grammatical rule or the meaning of a word, they can ask you questions like "What is [specific query]?" for clear and detailed responses.
Keep in mind that students may pose multiple questions within a single query. For example, "What is the meaning of [specific query 1]?, How do I use [specific query 2] in a sentence?, Can you explain the grammar rule for [specific query 3]?".

{question} [/INST]
"""


science_teacher = """
[INST] <<SYS>>
You are a cutting-edge science education AI, dedicated to helping students grasp complex scientific concepts and theories. Your mission is to provide insightful and understandable explanations to foster a deeper understanding of various scientific topics. If you don't have enough information to answer a question, respond with 'I am sorry, I don't have enough information.'
Always include a "REFERENCES" section in your answer, directing students to relevant sources for further exploration.
<</SYS>>

{context}

Envision a student engrossed in the study of science, navigating through intricate theories and phenomena. Your purpose is to assist them by offering clear and concise answers to their scientific inquiries. For instance, if a student is struggling to comprehend a particular scientific concept or needs clarification on an experiment, they can pose questions like "What is [specific query]?" for detailed responses.
It's important to note that students might present multiple questions within a single query. For instance, "Explain [specific query 1]?, How does [specific query 2] work?, Can you provide examples of [specific query 3]?".

{question} [/INST]
"""

# Catalogue of the personas above. Each entry's name matches the subject that
# its description and template actually cover.
prompt_infos = [
    {
        "name": "Social Sciences Teacher",
        "description": "Good for answering questions about Social Sciences subject like History, Geography, Civics, etc.",
        "prompt_template": social_sciences_teacher,
    },
    {
        "name": "English Teacher",
        "description": "Good for answering questions about English Language",
        "prompt_template": english_teacher,
    },
    {
        "name": "Science Teacher",
        "description": "Good for answering questions about Science subjects like Physics, Chemistry, Biology, etc.",
        "prompt_template": science_teacher,
    },
]
|
StudybotAPI/config.yml
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
CHUNK_SIZE: 1000
|
2 |
+
CHUNK_OVERLAP: 100
|
3 |
+
NUM_RESULTS: 3
|
4 |
+
EMBEDDINGS: "BAAI/llm-embedder"
|
5 |
+
VECTOR_DB: "./vectorstore/studybotstore"
|
6 |
+
NORMALIZE_EMBEDDINGS: True
|
7 |
+
COLLECTION_NAME: "studybotstore"
|
8 |
+
DEVICE: "cpu"
|
9 |
+
VECTOR_SPACE: "cosine"
|
10 |
+
CACHE_FOLDER: "./cache"
|
StudybotAPI/requirements.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi==0.99.1
|
2 |
+
uvicorn
|
3 |
+
requests
|
4 |
+
langchain==0.0.346
|
5 |
+
pydantic==1.10.2
|
6 |
+
pypdf
|
7 |
+
python-box
|
8 |
+
qdrant-client
|
9 |
+
torch
|
10 |
+
transformers
|
11 |
+
sentence_transformers
|
12 |
+
clarifai
|
13 |
+
Pillow
|
14 |
+
tiktoken
|
15 |
+
python-multipart
|
frontend/components/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .authors import *
|
2 |
+
from .user_greetings import *
|
frontend/components/authors.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
|
5 |
+
def authors():
    """Render a divider and the author's follow-me info box in the sidebar."""
    sidebar = st.sidebar
    follow_note = """
Follow me on:

Github → [@HemanthSai7](https://github.com/HemanthSai7)
"""
    sidebar.divider()
    sidebar.info(follow_note)
|
frontend/components/user_greetings.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def user_greetings():
    """Show the expanded greeting panel with welcome and feedback text in the sidebar."""
    greeting_panel = st.sidebar.expander("👋 Greetings!", expanded=True)
    paragraphs = (
        "Welcome to Studybot! This is a tool to help you revise your subjects. You can use the sidebar to navigate to the different pages. Have fun!",
        "If you have any feedback, please contact me on [LinkedIn](https://www.linkedin.com/in/hemanthsai7/) or [GitHub](https://github.com/HemanthSai7).",
    )
    with greeting_panel:
        for paragraph in paragraphs:
            st.write(paragraph)
|
frontend/layouts/__init__.py
ADDED
File without changes
|
frontend/layouts/mainlayout.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import streamlit as st
|
3 |
+
from typing import Callable
|
4 |
+
from components import authors, user_greetings
|
5 |
+
|
6 |
+
|
7 |
+
def mainlayout(func: Callable):
    """Decorate a Streamlit page function with the shared Studybot page chrome.

    The returned wrapper loads ``layouts/st_page_layouts.json``, applies the
    page config stored under the wrapped function's ``__name__`` (falling back
    to the ``"home"`` entry when there is none), renders the title, the
    greeting panel and the author box, and finally calls ``func`` with no
    arguments. The wrapper itself returns ``None``.

    Args:
        func: Zero-argument page function to wrap.

    Returns:
        The wrapping callable, carrying ``func``'s name and docstring.
    """
    import functools  # local import keeps the block self-contained

    @functools.wraps(func)  # keep the page function's identity on the wrapper
    def wrapper():
        with open("layouts/st_page_layouts.json", "r", encoding="utf-8") as f:
            st_page_layouts = json.load(f)

        # Pages without a dedicated entry reuse the home page configuration.
        layout_key = func.__name__ if func.__name__ in st_page_layouts else "home"
        st.set_page_config(**st_page_layouts[layout_key])
        st.markdown('# :rainbow[Welcome to Studybot]🚀')
        user_greetings()
        authors()

        func()

    return wrapper
|
frontend/layouts/st_page_layouts.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"home": {
|
3 |
+
"page_title": "Studybot",
|
4 |
+
"layout": "wide",
|
5 |
+
"page_icon": "🏡",
|
6 |
+
"initial_sidebar_state": "expanded"
|
7 |
+
},
|
8 |
+
"bot": {
|
9 |
+
"page_title": "Chatbot",
|
10 |
+
"layout": "wide",
|
11 |
+
"page_icon": "💻",
|
12 |
+
"initial_sidebar_state": "expanded"
|
13 |
+
}
|
14 |
+
}
|
frontend/pages/2_🤖_bot.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
from layouts.mainlayout import mainlayout
|
4 |
+
|
5 |
+
@mainlayout
def bot():
    """Render the chatbot page's subheader."""
    tagline = "Revise your subjects with Studybot!"
    st.subheader(tagline)
|
8 |
+
|
9 |
+
bot()
|
frontend/requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
streamlit
|
frontend/test.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
import requests
|
4 |
+
|
5 |
+
# from studybot import qa_chain
|
6 |
+
# st.write(qa_chain)
|
7 |
+
|
8 |
+
# Manual smoke test for the hosted Studybot API: upload a PDF, then ask a question.
API_BASE_URL = "https://hemanthsai7-studybotapi.hf.space"
# requests waits indefinitely unless a timeout is given; bound every call so a
# stalled backend surfaces as an error instead of a frozen page.
REQUEST_TIMEOUT_SECONDS = 300

upload_pdf = st.file_uploader("Upload PDF", type="pdf")
if upload_pdf is not None:
    # NOTE(review): Streamlit re-runs the script on each interaction, so this
    # looks like it re-posts the file every time — confirm and cache if needed.
    files = {"file": upload_pdf}
    response = requests.post(
        f"{API_BASE_URL}/api/upload",
        files=files,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    st.write(response)

query = st.text_input("Question", key="question")
st.write(st.session_state)

if st.button("Ask"):
    answer = requests.post(
        f"{API_BASE_URL}/api/inference",
        json={"promptMessage": query},
        timeout=REQUEST_TIMEOUT_SECONDS,
    ).json()
    st.write(answer)
|
frontend/🏡_Home.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
from layouts.mainlayout import mainlayout
|
4 |
+
|
5 |
+
|
6 |
+
@mainlayout
def home():
    """Render the Studybot landing page.

    Shows the introduction, the "How does it work?" and FAQ expanders, and the
    architecture heading with its diagram image.
    """
    st.subheader("Revise your subjects with Studybot!")

    st.markdown(
        """
<p style='text-align: justify;'>
<b>Studybot</b> is a <b>free</b> and <b>open-source</b> tool for <b>automated</b> and <b>personalized</b> learning.
It is a <b>chatbot</b> that helps you to <b>learn</b> and <b>memorize</b> new information in a <b>fun</b> and <b>easy</b> way.
Studybot is <b>free</b> and <b>open-source</b>, so you can use it <b>without any limitations</b> and <b>without any costs</b>.
You can also <b>customize</b> it to your needs and <b>contribute</b> to its development.
<b>Enjoy</b> your learning with Studybot!
</p>
""",
        unsafe_allow_html=True,
    )

    with st.expander("How does it work?", expanded=True):
        st.markdown(
            """
- When you upload a document, it will be divided into smaller chunks and stored in a special type of database called a vector index that allows for semantic search and retrieval. I'm using Qdrant vector database for this purpose.

- When you ask a question, Studybot will search through the document chunks and find the most relevant ones using the vector index. Then, it will use Mistral-7B-instruct to generate a final answer.

"""
        )

    with st.expander("FAQs 🤔"):
        st.markdown(
            """
- **What is the best way to upload a document?**<br>
The best way to upload a document is to upload a PDF file. Studybot will automatically divide it into smaller chunks. You can also upload a text file. In this case, Studybot will divide it into smaller chunks.

- **What is the best way to ask questions?**<br>
The best way to ask questions is to ask questions that are related to the document you uploaded. If you ask questions that are not related to the document you uploaded, Studybot will not be able to answer them.

- **Is my data safe?**<br>
Yes, your data is safe. Studybot does not store your documents or questions. All uploaded data is deleted after you close the browser tab since it is stored in the RAM memory. However, if you want to be sure that your data is safe, you can use the `Clear data 🧹` button to delete all uploaded data.

- **Why does it take so long to index my document?**<br>
When you upload a document, it is divided into smaller chunks and stored in a special type of database called a vector index that allows for semantic search and retrieval. It takes some time to index your document because it has to be divided into smaller chunks and stored in the vector index. However, once your document is indexed, it will be much faster to search through it.

- **Are the answers 100% accurate?**<br>
- No, the answers are not 100% accurate. Studybot uses Mistral-7B to generate answers. Mistral-7B is a powerful language model, but it sometimes makes mistakes and is prone to hallucinations. Also, Studybot uses semantic search to find the most relevant chunks and does not see the entire document, which means that it may not be able to find all the relevant information and may not be able to answer all questions (especially summary-type questions or questions that require a lot of context from the document).

- But for most of the time, Studybot is very accurate and can answer most questions. Always check with the sources to make sure that the answers are correct.

- **What is the best way to contribute to Studybot?**<br>
The best way to contribute to Studybot is to create an issue on GitHub. I will be happy to answer your questions and help you with your contributions.
""",
            unsafe_allow_html=True,
        )

    st.divider()
    # architecture heading in the middle; the closing tag must match the
    # opening <h2> for the markup to be well-formed
    st.markdown("<h2 style='text-align: center; color: black;'>Studybot Architecture</h2>", unsafe_allow_html=True)
    st.image("images/architecture.png")
|
63 |
+
|
64 |
+
|
65 |
+
home()
|
notebooks/Untitled6.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/embeddings.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/selfrag.ipynb
ADDED
File without changes
|
test.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class A:
    """Toy class exposing the private ``_bill`` value through a read/write property."""

    def __init__(self):
        self._bill = 1

    # Same getter/setter pair as the decorator form, spelled with property().
    # The setter stores the value unconditionally.
    # raise PermissionError("You can't change the bill")
    bill = property(
        lambda self: self._bill,
        lambda self, value: setattr(self, "_bill", value),
    )
|
13 |
+
|
14 |
+
class B(A):
    """Subclass of ``A`` whose ``_bill`` starts at 2 instead of the base value."""

    def __init__(self):
        # Run the base initialiser first, then override the stored value.
        super(B, self).__init__()
        self._bill = 2
|
18 |
+
|
19 |
+
# b = B()
# print(b.bill)
# Exercise the property setter on the base class and print the stored value.
a=A()
a.bill=3
print(a.bill)
|
24 |
+
|
25 |
+
|
26 |
+
|