Spaces:
Running
Running
Commit
·
7e24b41
1
Parent(s):
e26a49a
INITIAL COMMIT
Browse files- .gitignore +162 -0
- app.py +310 -7
- config.ini +25 -0
- params.yaml +19 -0
- requirements.txt +15 -0
- secrets.env +1 -0
- setup.py +15 -0
- src/__init__.py +0 -0
- src/components/__init__.py +0 -0
- src/components/loaders/__init__.py +0 -0
- src/components/loaders/pdfLoader.py +42 -0
- src/components/loaders/websiteCrawler.py +65 -0
- src/components/loaders/youtubeLoader.py +24 -0
- src/components/rag/RAG.py +42 -0
- src/components/rag/__init__.py +0 -0
- src/components/vectors/__init__.py +0 -0
- src/components/vectors/vectorstore.py +38 -0
- src/pipelines/__init__.py +0 -0
- src/pipelines/completePipeline.py +39 -0
- src/utils/__init__.py +0 -0
- src/utils/exceptions.py +16 -0
- src/utils/functions.py +17 -0
- src/utils/logging.py +12 -0
.gitignore
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
110 |
+
.pdm.toml
|
111 |
+
.pdm-python
|
112 |
+
.pdm-build/
|
113 |
+
|
114 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
115 |
+
__pypackages__/
|
116 |
+
|
117 |
+
# Celery stuff
|
118 |
+
celerybeat-schedule
|
119 |
+
celerybeat.pid
|
120 |
+
|
121 |
+
# SageMath parsed files
|
122 |
+
*.sage.py
|
123 |
+
|
124 |
+
# Environments
|
125 |
+
.env
|
126 |
+
.venv
|
127 |
+
env/
|
128 |
+
venv/
|
129 |
+
ENV/
|
130 |
+
env.bak/
|
131 |
+
venv.bak/
|
132 |
+
|
133 |
+
# Spyder project settings
|
134 |
+
.spyderproject
|
135 |
+
.spyproject
|
136 |
+
|
137 |
+
# Rope project settings
|
138 |
+
.ropeproject
|
139 |
+
|
140 |
+
# mkdocs documentation
|
141 |
+
/site
|
142 |
+
|
143 |
+
# mypy
|
144 |
+
.mypy_cache/
|
145 |
+
.dmypy.json
|
146 |
+
dmypy.json
|
147 |
+
|
148 |
+
# Pyre type checker
|
149 |
+
.pyre/
|
150 |
+
|
151 |
+
# pytype static type analyzer
|
152 |
+
.pytype/
|
153 |
+
|
154 |
+
# Cython debug symbols
|
155 |
+
cython_debug/
|
156 |
+
|
157 |
+
# PyCharm
|
158 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
159 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
160 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
161 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
162 |
+
#.idea/
|
app.py
CHANGED
@@ -1,7 +1,310 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.pipelines.completePipeline import Pipeline
|
2 |
+
import gradio as gr
|
3 |
+
import os
|
4 |
+
|
5 |
+
# os.system("apt-get update -y")
|
6 |
+
# os.system("apt-get upgrade -y")
|
7 |
+
# os.system("apt install poppler-utils -y")
|
8 |
+
|
9 |
+
chain = None
|
10 |
+
pipeline = Pipeline()
|
11 |
+
|
12 |
+
|
13 |
+
def getTextResponse(text: str, inputQuery: str):
    """Answer inputQuery against user-pasted plain text.

    Builds the RAG chain lazily on the first call and caches it in the
    module-global `chain`; later questions reuse the cached chain until
    the Clear button resets it.
    """
    global chain
    if chain is None:
        chain = pipeline.plainText(text = text)
    return chain.invoke({"question": inputQuery})
|
25 |
+
|
26 |
+
|
27 |
+
def getSearchablePdfResponse(path: str, inputQuery: str):
    """Answer inputQuery against a PDF that has an embedded text layer.

    Builds the RAG chain lazily on the first call and caches it in the
    module-global `chain`; later questions reuse the cached chain.
    """
    global chain
    if chain is None:
        chain = pipeline.searchablePdf(path = path)
    return chain.invoke({"question": inputQuery})
|
39 |
+
|
40 |
+
def getScannablePdfResponse(path: str, inputQuery: str):
    """Answer inputQuery against a scanned (image-only) PDF via OCR.

    Builds the RAG chain lazily on the first call and caches it in the
    module-global `chain`; later questions reuse the cached chain.
    """
    global chain
    if chain is None:
        chain = pipeline.scannablePdf(path = path)
    return chain.invoke({"question": inputQuery})
|
52 |
+
|
53 |
+
def clearFunction():
    """Drop the cached RAG chain so the next question rebuilds it from fresh input."""
    global chain
    chain = None
|
56 |
+
|
57 |
+
with gr.Blocks() as textInterface:
|
58 |
+
with gr.Row():
|
59 |
+
inputText = gr.Textbox(
|
60 |
+
label = "Input Text",
|
61 |
+
placeholder = "Enter you text here"
|
62 |
+
)
|
63 |
+
with gr.Row():
|
64 |
+
question = gr.Textbox(
|
65 |
+
label = "Question",
|
66 |
+
placeholder = "Enter your question here"
|
67 |
+
)
|
68 |
+
answer = gr.Textbox(
|
69 |
+
label = "Response",
|
70 |
+
interactive = False
|
71 |
+
)
|
72 |
+
with gr.Row():
|
73 |
+
submitButton = gr.Button(
|
74 |
+
value = "Submit",
|
75 |
+
variant = "primary"
|
76 |
+
)
|
77 |
+
clearButton = gr.ClearButton(
|
78 |
+
components = [inputText, question, answer],
|
79 |
+
value = "Clear",
|
80 |
+
variant = "secondary"
|
81 |
+
)
|
82 |
+
submitButton.click(
|
83 |
+
fn = getTextResponse,
|
84 |
+
inputs = [inputText, question],
|
85 |
+
outputs = [answer]
|
86 |
+
)
|
87 |
+
clearButton.click(
|
88 |
+
fn = clearFunction
|
89 |
+
)
|
90 |
+
|
91 |
+
|
92 |
+
with gr.Blocks() as searchablePdf:
|
93 |
+
with gr.Row():
|
94 |
+
inputFile = gr.File(
|
95 |
+
file_types = [".pdf"],
|
96 |
+
file_count = "single",
|
97 |
+
label = "Select PDF"
|
98 |
+
)
|
99 |
+
with gr.Row():
|
100 |
+
question = gr.Textbox(
|
101 |
+
label = "Question",
|
102 |
+
placeholder = "Enter your question here"
|
103 |
+
)
|
104 |
+
answer = gr.Textbox(
|
105 |
+
label = "Response",
|
106 |
+
interactive = False
|
107 |
+
)
|
108 |
+
with gr.Row():
|
109 |
+
submitButton = gr.Button(
|
110 |
+
value = "Submit",
|
111 |
+
variant = "primary"
|
112 |
+
)
|
113 |
+
clearButton = gr.ClearButton(
|
114 |
+
components = [inputFile, question, answer],
|
115 |
+
value = "Clear",
|
116 |
+
variant = "secondary"
|
117 |
+
)
|
118 |
+
submitButton.click(
|
119 |
+
fn = getSearchablePdfResponse,
|
120 |
+
inputs = [inputFile, question],
|
121 |
+
outputs = [answer]
|
122 |
+
)
|
123 |
+
clearButton.click(
|
124 |
+
fn = clearFunction
|
125 |
+
)
|
126 |
+
|
127 |
+
|
128 |
+
with gr.Blocks() as scannablePdf:
|
129 |
+
with gr.Row():
|
130 |
+
inputFile = gr.File(
|
131 |
+
file_types = [".pdf"],
|
132 |
+
file_count = "single",
|
133 |
+
label = "Select PDF"
|
134 |
+
)
|
135 |
+
with gr.Row():
|
136 |
+
question = gr.Textbox(
|
137 |
+
label = "Question",
|
138 |
+
placeholder = "Enter your question here"
|
139 |
+
)
|
140 |
+
answer = gr.Textbox(
|
141 |
+
label = "Response",
|
142 |
+
interactive = False
|
143 |
+
)
|
144 |
+
with gr.Row():
|
145 |
+
submitButton = gr.Button(
|
146 |
+
value = "Submit",
|
147 |
+
variant = "primary"
|
148 |
+
)
|
149 |
+
clearButton = gr.ClearButton(
|
150 |
+
components = [inputFile, question, answer],
|
151 |
+
value = "Clear",
|
152 |
+
variant = "secondary"
|
153 |
+
)
|
154 |
+
submitButton.click(
|
155 |
+
fn = getScannablePdfResponse,
|
156 |
+
inputs = [inputFile, question],
|
157 |
+
outputs = [answer]
|
158 |
+
)
|
159 |
+
clearButton.click(
|
160 |
+
fn = clearFunction
|
161 |
+
)
|
162 |
+
|
163 |
+
|
164 |
+
def getLinksButtonFn(baseUrl: str):
    """Crawl baseUrl for same-site links and reveal the hidden question/answer rows."""
    fetchedLinks = pipeline.webCrawler.getLinks(url = baseUrl)
    linkSelector = gr.CheckboxGroup(
        choices = fetchedLinks,
        label = "Fetched Links",
        visible = True
    )
    # The question and button rows were laid out with visible=False; un-hide them
    # now that there are links to choose from.
    return linkSelector, gr.Row(visible = True), gr.Row(visible = True)
|
178 |
+
|
179 |
+
def getWebsiteResponse(links: list[str], inputQuery: str):
    """Answer inputQuery against the text of the user-selected website links.

    Builds the RAG chain lazily on the first call and caches it in the
    module-global `chain`; later questions reuse the cached chain.
    """
    global chain
    if chain is None:
        # (Removed leftover debug print of the selected links.)
        chain = pipeline.webCrawl(urls = links)
    return chain.invoke({"question": inputQuery})
|
192 |
+
|
193 |
+
def clearWebsiteResponse():
    """Reset the cached chain and swap in an empty, hidden link selector."""
    global chain
    chain = None
    return gr.CheckboxGroup(
        choices = [],
        label = "Fetched Links",
        visible = False
    )
|
202 |
+
|
203 |
+
with gr.Blocks() as websiteCrawler:
|
204 |
+
with gr.Row():
|
205 |
+
inputUrl = gr.Textbox(
|
206 |
+
label = "Base URL",
|
207 |
+
placeholder = "Enter the Base URL to fetch other links",
|
208 |
+
scale = 3
|
209 |
+
)
|
210 |
+
getLinksButton = gr.Button(
|
211 |
+
value = "Get Links",
|
212 |
+
variant = "primary",
|
213 |
+
scale = 1
|
214 |
+
)
|
215 |
+
checkboxes = gr.CheckboxGroup(
|
216 |
+
choices = [],
|
217 |
+
label = "Fetched Links",
|
218 |
+
)
|
219 |
+
with gr.Row(visible = False) as row2:
|
220 |
+
question = gr.Textbox(
|
221 |
+
label = "Question",
|
222 |
+
placeholder = "Enter your question here"
|
223 |
+
)
|
224 |
+
answer = gr.Textbox(
|
225 |
+
label = "Response",
|
226 |
+
interactive = False
|
227 |
+
)
|
228 |
+
with gr.Row(visible = False) as row3:
|
229 |
+
submitButton = gr.Button(
|
230 |
+
value = "Submit",
|
231 |
+
variant = "primary"
|
232 |
+
)
|
233 |
+
clearButton = gr.ClearButton(
|
234 |
+
components = [question, answer],
|
235 |
+
value = "Clear",
|
236 |
+
variant = "secondary"
|
237 |
+
)
|
238 |
+
getLinksButton.click(
|
239 |
+
fn = getLinksButtonFn,
|
240 |
+
inputs = [inputUrl],
|
241 |
+
outputs = [checkboxes, row2, row3]
|
242 |
+
)
|
243 |
+
submitButton.click(
|
244 |
+
fn = getWebsiteResponse,
|
245 |
+
inputs = [checkboxes, question],
|
246 |
+
outputs = [answer]
|
247 |
+
)
|
248 |
+
clearButton.click(
|
249 |
+
fn = clearWebsiteResponse,
|
250 |
+
inputs = None,
|
251 |
+
outputs = [checkboxes]
|
252 |
+
)
|
253 |
+
|
254 |
+
|
255 |
+
def getYoutubeResponse(links: str, inputQuery: str):
    """Answer inputQuery against transcripts of comma-separated YouTube links.

    Builds the RAG chain lazily on the first call and caches it in the
    module-global `chain`; later questions reuse the cached chain.
    """
    global chain
    # Tolerate stray commas and whitespace in the user-typed list; the
    # original forwarded empty strings as URLs.
    videoLinks = [link.strip() for link in links.split(",") if link.strip()]
    if chain is None:
        chain = pipeline.youtubeLinks(urls = videoLinks)
    return chain.invoke({"question": inputQuery})
|
268 |
+
|
269 |
+
|
270 |
+
with gr.Blocks() as youtubeInterface:
|
271 |
+
with gr.Row():
|
272 |
+
inputLinks = gr.Textbox(
|
273 |
+
label = "Youtube Links",
|
274 |
+
placeholder = 'Enter comma(,)-separated youtube video links'
|
275 |
+
)
|
276 |
+
with gr.Row():
|
277 |
+
question = gr.Textbox(
|
278 |
+
label = "Question",
|
279 |
+
placeholder = "Enter your question here"
|
280 |
+
)
|
281 |
+
answer = gr.Textbox(
|
282 |
+
label = "Response",
|
283 |
+
interactive = False
|
284 |
+
)
|
285 |
+
with gr.Row():
|
286 |
+
submitButton = gr.Button(
|
287 |
+
value = "Submit",
|
288 |
+
variant = "primary"
|
289 |
+
)
|
290 |
+
clearButton = gr.ClearButton(
|
291 |
+
components = [inputLinks, question, answer],
|
292 |
+
value = "Clear",
|
293 |
+
variant = "secondary"
|
294 |
+
)
|
295 |
+
submitButton.click(
|
296 |
+
fn = getYoutubeResponse,
|
297 |
+
inputs = [inputLinks, question],
|
298 |
+
outputs = [answer]
|
299 |
+
)
|
300 |
+
clearButton.click(
|
301 |
+
fn = clearFunction
|
302 |
+
)
|
303 |
+
|
304 |
+
|
305 |
+
# Assemble the five single-purpose UIs into one tabbed application; tab order
# matches the label list below.
application = gr.TabbedInterface(
    [textInterface, searchablePdf, scannablePdf, websiteCrawler, youtubeInterface],
    ["Text", "Searchable PDF", "Scannable PDF", "Website Text", "Youtube Transcripts"]
)

# Start the Gradio server (blocking call).
application.launch()
|
config.ini
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[EMBEDDINGS]
|
2 |
+
embeddingModel = sentence-transformers/all-MiniLM-L6-v2
|
3 |
+
device = cpu
|
4 |
+
normalize_embeddings = true
|
5 |
+
|
6 |
+
[VECTORSTORE]
|
7 |
+
chunkSize = 1250
|
8 |
+
chunkOverlap = 250
|
9 |
+
addStartIndex = true
|
10 |
+
|
11 |
+
[LLM]
|
12 |
+
llmModel = llama-3.1-70b-versatile
|
13 |
+
maxTokens = 512
|
14 |
+
temperature = 0.75
|
15 |
+
|
16 |
+
[RETRIEVER]
|
17 |
+
searchType = mmr
|
18 |
+
k = 5
|
19 |
+
fetchK = 10
|
20 |
+
|
21 |
+
[WEBCRAWLER]
|
22 |
+
timeout = 30
|
23 |
+
|
24 |
+
[EASYOCR]
|
25 |
+
gpu = false
|
params.yaml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
prompt: |
|
2 |
+
INSTRUCTIONS:
|
3 |
+
=====================================
|
4 |
+
### Role
|
5 |
+
**Primary Function**: You are an AI chatbot designed to provide accurate and efficient assistance to users based on provided context data. Your responses must be reliable, friendly, and directly address user inquiries or issues. Always clarify any unclear questions, and conclude responses positively.
|
6 |
+
### Constraints
|
7 |
+
1. **No Data Disclosure**: Never reveal access to training data or any context explicitly.
|
8 |
+
2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
|
9 |
+
3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
|
10 |
+
4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
|
11 |
+
Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Make sure the user is always happy and satisfied with the outputs you return.
|
12 |
+
CONTEXT:
|
13 |
+
=====================================
|
14 |
+
{context}
|
15 |
+
======================================
|
16 |
+
QUESTION:
|
17 |
+
=====================================
|
18 |
+
{question}
|
19 |
+
NOTE: Generate responses directly without using phrases like "Response:" or "Answer:". NEVER mention the user about usage of any context to generate an answer.
|
requirements.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
langchain
|
3 |
+
langchain-core
|
4 |
+
langchain-groq
|
5 |
+
langchain-community
|
6 |
+
langchain_huggingface
|
7 |
+
pymupdf
|
8 |
+
easyocr
|
9 |
+
numpy
|
10 |
+
pdf2image
|
11 |
+
requests
|
12 |
+
python-dotenv
|
13 |
+
beautifulsoup4
|
14 |
+
youtube-transcript-api
|
15 |
+
urllib3
|
secrets.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# SECURITY: a live Groq API key was committed to version control here. The
# leaked key must be rotated immediately; supply the real value through the
# deployment environment (e.g. Space secrets), never in the repository.
GROQ_API_KEY=<set-via-environment>
|
setup.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setuptools import setup, find_packages
|
2 |
+
|
3 |
+
def getRequirements(requirementsPath: str) -> list[str]:
    """Read a pip requirements file and return its non-empty requirement lines.

    The original split on "\\n" verbatim, so a trailing newline or blank line
    produced empty-string entries in install_requires.
    """
    with open(requirementsPath) as file:
        return [line.strip() for line in file.read().splitlines() if line.strip()]
|
7 |
+
|
8 |
+
setup(
|
9 |
+
name = "ConversAI",
|
10 |
+
author = "Rauhan Ahmed Siddiqui",
|
11 |
+
author_email = "rauhaan.siddiqui@gmail.com",
|
12 |
+
version = "0.1",
|
13 |
+
packages = find_packages(),
|
14 |
+
install_requires = getRequirements(requirementsPath = "requirements.txt")
|
15 |
+
)
|
src/__init__.py
ADDED
File without changes
|
src/components/__init__.py
ADDED
File without changes
|
src/components/loaders/__init__.py
ADDED
File without changes
|
src/components/loaders/pdfLoader.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.utils.functions import cleanText, getConfig
|
2 |
+
from concurrent.futures import ThreadPoolExecutor
|
3 |
+
from src.utils.exceptions import CustomException
|
4 |
+
from pdf2image import convert_from_path
|
5 |
+
from src.utils.logging import logger
|
6 |
+
import numpy as np
|
7 |
+
import pymupdf
|
8 |
+
import easyocr
|
9 |
+
|
10 |
+
|
11 |
+
class PdfLoader:
    """Extracts plain text from PDFs: PyMuPDF for digital PDFs, EasyOCR for scans."""

    def __init__(self) -> None:
        self.config = getConfig(path = "config.ini")
        # English-only OCR reader; GPU use is controlled by config.ini [EASYOCR].
        self.reader = easyocr.Reader(['en'], gpu = self.config.getboolean("EASYOCR", "gpu"))

    def extractTextFromPage(self, page):
        """Return the cleaned text layer of a single PyMuPDF page."""
        return cleanText(text = page.get_text())

    def searchablePdf(self, pdfPath: str):
        """Extract text from a PDF with an embedded text layer, pages in parallel."""
        try:
            logger.info("Text Extraction Started from Searchable PDF")
            document = pymupdf.open(pdfPath)
            loadedPages = [document.load_page(index) for index in range(len(document))]
            with ThreadPoolExecutor() as executor:
                pageTexts = list(executor.map(self.extractTextFromPage, loadedPages))
            document.close()
            return "\n".join(pageTexts)
        except Exception as e:
            # NOTE(review): failures are logged and the method falls through to
            # an implicit None — callers should expect a possible None result.
            logger.error(CustomException(e))

    def getText(self, image):
        """OCR one page image with EasyOCR and return its cleaned text."""
        ocrText = "\n".join(fragment[1] for fragment in self.reader.readtext(np.array(image), paragraph=True))
        return cleanText(text = ocrText)

    def scannablePdf(self, pdfPath: str):
        """Rasterize a scanned PDF with pdf2image and OCR every page."""
        try:
            logger.info("Text Extraction Started from Scannable PDF")
            pageImages = convert_from_path(pdfPath)
            return "\n".join(self.getText(image) for image in pageImages)
        except Exception as e:
            logger.error(CustomException(e))
|
src/components/loaders/websiteCrawler.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from concurrent.futures import ThreadPoolExecutor
|
2 |
+
from src.utils.exceptions import CustomException
|
3 |
+
from urllib.parse import urlparse, urljoin
|
4 |
+
from src.utils.functions import getConfig
|
5 |
+
from src.utils.functions import cleanText
|
6 |
+
from src.utils.logging import logger
|
7 |
+
from bs4 import BeautifulSoup
|
8 |
+
import time
|
9 |
+
import requests
|
10 |
+
|
11 |
+
|
12 |
+
class WebsiteCrawler:
    """Discovers same-site links from a base URL and extracts visible page text."""

    def __init__(self):
        self.config = getConfig(path = "config.ini")

    def getLinksFromPage(self, url: str):
        """Return unique, fragment-free links on one page that stay on url's site."""
        # A hung server would otherwise stall the whole crawl: bound every request
        # by the configured crawl budget.
        response = requests.get(url, timeout = self.config.getint("WEBCRAWLER", "timeout"))
        soup = BeautifulSoup(response.content, "html.parser")
        links = []
        for anchor in soup.find_all("a"):
            href = anchor.attrs.get("href")
            if href is None:
                continue
            if urlparse(href).netloc == urlparse(url).netloc:
                links.append(href)
            elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                # Relative link: resolve against the base URL.
                links.append(urljoin(url + "/", href))
        # Hoisted out of the anchor loop — the original re-filtered and
        # re-deduplicated the entire list on every iteration (O(n^2)).
        return list({link for link in links if "#" not in link})

    def getLinks(self, url: str):
        """Crawl one level deep from url, stopping after the configured time budget."""
        try:
            logger.info("fetching links from url")
            start = time.time()
            links = self.getLinksFromPage(url)
            uniqueLinks = set()
            for link in links:
                if time.time() - start > self.config.getint("WEBCRAWLER", "timeout"):
                    break
                uniqueLinks = uniqueLinks.union(set(self.getLinksFromPage(link)))
            # endswith-guard avoids the original's IndexError (`x[-1]`) on an
            # empty string; behavior for non-empty URLs is unchanged.
            return list({x[:-1] if x.endswith("/") else x for x in uniqueLinks})
        except Exception as e:
            # Best-effort: log and return None rather than crash the UI callback.
            logger.error(CustomException(e))

    def extractTextFromUrl(self, url: str):
        """Download one page and return its cleaned visible text."""
        response = requests.get(url, timeout = self.config.getint("WEBCRAWLER", "timeout"))
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return cleanText(text = soup.get_text(separator=' ', strip=True))

    def extractTextFromUrlList(self, urls: list[str]):
        """Fetch all urls concurrently and join their texts with newlines."""
        try:
            logger.info("extracting text from urls")
            with ThreadPoolExecutor() as executor:
                texts = list(executor.map(self.extractTextFromUrl, urls))
            return "\n".join(texts)
        except Exception as e:
            logger.error(CustomException(e))
|
src/components/loaders/youtubeLoader.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders import YoutubeLoader
|
2 |
+
from src.utils.exceptions import CustomException
|
3 |
+
from src.utils.functions import cleanText
|
4 |
+
from src.utils.logging import logger
|
5 |
+
|
6 |
+
|
7 |
+
class YoutubeTranscriptLoader:
    """Fetches and cleans YouTube video transcripts via LangChain's YoutubeLoader."""

    def __init__(self):
        pass

    def getTranscripts(self, urls: list[str]):
        """Return the cleaned transcripts of the given video URLs, newline-joined.

        Duplicates are collapsed with set(); note this also makes the output
        order nondeterministic. A video whose transcript cannot be loaded
        contributes an empty string instead of aborting the batch.
        """
        texts = []
        for url in set(urls):
            try:
                loader = YoutubeLoader.from_youtube_url(
                    url, add_video_info=False
                )
                # Concatenate all transcript documents of one video into one text.
                doc = " ".join([x.page_content for x in loader.load()])
                texts.append(cleanText(text = doc))
            except Exception as e:
                logger.error(CustomException(e))
                doc = ""
                texts.append(doc)
        return "\n".join(texts)
|
src/components/rag/RAG.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.components.vectors.vectorstore import VectorStore
|
2 |
+
from langchain_core.output_parsers import StrOutputParser
|
3 |
+
from langchain_core.prompts import ChatPromptTemplate
|
4 |
+
from langchain_core.runnables import RunnableLambda
|
5 |
+
from src.utils.exceptions import CustomException
|
6 |
+
from src.utils.functions import getConfig
|
7 |
+
from src.utils.functions import loadYaml
|
8 |
+
from src.utils.logging import logger
|
9 |
+
from langchain_groq import ChatGroq
|
10 |
+
|
11 |
+
|
12 |
+
class Chain:
    """Builds a retrieval-augmented generation chain: retriever -> prompt -> Groq LLM."""

    def __init__(self):
        self.config = getConfig(path = "config.ini")
        self.store = VectorStore()
        # The prompt template (with {context} and {question} slots) lives in params.yaml.
        prompt = loadYaml(path = "params.yaml")["prompt"]
        self.prompt = ChatPromptTemplate.from_template(prompt)

    def formatDocs(self, docs):
        """Join retrieved documents into a single context string for the prompt.

        Each document is followed by a blank-line separator; an empty retrieval
        still yields usable prompt text ("No Context Found").
        """
        context = "".join(f"{doc}\n\n\n" for doc in docs)
        return context if context else "No Context Found"

    def returnChain(self, text: str):
        """Index `text` into a fresh vector store and return the runnable RAG chain.

        Returns None (after logging) if chain construction fails.
        """
        try:
            logger.info("preparing chain")
            store = self.store.setupStore(text = text)
            # The incoming payload is {"question": ...}; the question both drives
            # retrieval (context branch) and is passed through to the prompt.
            chain = (
                {"context": RunnableLambda(lambda x: x["question"]) | store | RunnableLambda(self.formatDocs),
                 "question": RunnableLambda(lambda x: x["question"])}
                | self.prompt
                | ChatGroq(model_name = self.config.get("LLM", "llmModel"), temperature = self.config.getfloat("LLM", "temperature"), max_tokens = self.config.getint("LLM", "maxTokens"))
                | StrOutputParser()
            )
            return chain
        except Exception as e:
            logger.error(CustomException(e))
|
src/components/rag/__init__.py
ADDED
File without changes
|
src/components/vectors/__init__.py
ADDED
File without changes
|
src/components/vectors/vectorstore.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
2 |
+
from langchain_core.vectorstores import InMemoryVectorStore
|
3 |
+
from langchain_community.docstore.document import Document
|
4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
+
from src.utils.exceptions import CustomException
|
6 |
+
from src.utils.functions import getConfig
|
7 |
+
from src.utils.logging import logger
|
8 |
+
|
9 |
+
class VectorStore:
    """Chunks raw text, embeds it with HuggingFace embeddings, and exposes a retriever."""

    def __init__(self):
        self.config = getConfig(path = "config.ini")
        self.vectorEmbeddings = HuggingFaceEmbeddings(
            model_name = self.config.get("EMBEDDINGS", "embeddingModel"),
            model_kwargs = {"device": self.config.get("EMBEDDINGS", "device")},
            encode_kwargs = {"normalize_embeddings": self.config.getboolean("EMBEDDINGS", "normalize_embeddings")}
        )
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size = self.config.getint("VECTORSTORE", "chunkSize"),
            chunk_overlap = self.config.getint("VECTORSTORE", "chunkOverlap"),
            add_start_index = self.config.getboolean("VECTORSTORE", "addStartIndex")
        )

    def setupStore(self, text: str):
        """Embed `text` into an in-memory store and return a configured retriever.

        Returns None (after logging) if indexing fails.
        """
        try:
            store = InMemoryVectorStore(self.vectorEmbeddings)
            textDocument = Document(page_content = text)
            documents = self.splitter.split_documents([textDocument])
            store.add_documents(documents = documents)
            return store.as_retriever(
                search_type = self.config.get("RETRIEVER", "searchType"),
                search_kwargs = {
                    "k": self.config.getint("RETRIEVER", "k"),
                    "fetch_k": self.config.getint("RETRIEVER", "fetchK")
                }
            )
        except Exception as e:
            # Removed the stray debug print that duplicated this log line on stdout.
            logger.error(CustomException(e))
|
src/pipelines/__init__.py
ADDED
File without changes
|
src/pipelines/completePipeline.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.components.loaders.websiteCrawler import WebsiteCrawler
|
2 |
+
from src.components.loaders.youtubeLoader import YoutubeTranscriptLoader
|
3 |
+
from src.components.loaders.pdfLoader import PdfLoader
|
4 |
+
from src.components.rag.RAG import Chain
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
|
7 |
+
load_dotenv("secrets.env")
|
8 |
+
|
9 |
+
class Pipeline:
    """Facade wiring each input source (text, PDF, web, YouTube) to a RAG chain."""

    def __init__(self):
        self.pdfLoader = PdfLoader()
        self.webCrawler = WebsiteCrawler()
        self.youtubeLoader = YoutubeTranscriptLoader()
        self.ragChain = Chain()

    def plainText(self, text: str):
        """Build a RAG chain directly over user-supplied text."""
        return self.ragChain.returnChain(text = text)

    def searchablePdf(self, path: str):
        """Build a RAG chain over the text layer of a digital PDF."""
        extractedText = self.pdfLoader.searchablePdf(pdfPath = path)
        return self.ragChain.returnChain(text = extractedText)

    def scannablePdf(self, path: str):
        """Build a RAG chain over OCR'd text of a scanned PDF."""
        extractedText = self.pdfLoader.scannablePdf(pdfPath = path)
        return self.ragChain.returnChain(text = extractedText)

    def webCrawl(self, urls: list[str]):
        """Build a RAG chain over the visible text of the given web pages."""
        extractedText = self.webCrawler.extractTextFromUrlList(urls = urls)
        return self.ragChain.returnChain(text = extractedText)

    def youtubeLinks(self, urls: list[str]):
        """Build a RAG chain over the transcripts of the given YouTube videos."""
        # Removed the debug print that dumped the entire transcript to stdout.
        extractedText = self.youtubeLoader.getTranscripts(urls = urls)
        return self.ragChain.returnChain(text = extractedText)
|
src/utils/__init__.py
ADDED
File without changes
|
src/utils/exceptions.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys

def error_message_detail(error):
    """Format `error` with the filename and line number of the active exception.

    The original unconditionally dereferenced sys.exc_info()[2], which is None
    when no exception is being handled, so constructing CustomException outside
    an `except` block raised AttributeError instead of producing a message.
    """
    _, _, exc_tb = sys.exc_info()
    if exc_tb is None:
        return "Error encountered saying [{}]".format(error)
    filename = exc_tb.tb_frame.f_code.co_filename
    lineno = exc_tb.tb_lineno
    return "Error encountered in line no [{}], filename : [{}], saying [{}]".format(lineno, filename, error)

class CustomException(Exception):
    """Exception wrapper that captures file/line context from the active traceback."""

    def __init__(self, error_message):
        super().__init__(error_message)
        # Resolve location eagerly, while the originating traceback is still active.
        self.error_message = error_message_detail(error_message)

    def __str__(self) -> str:
        return self.error_message
|
src/utils/functions.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import configparser
|
2 |
+
import string
|
3 |
+
import yaml
|
4 |
+
|
5 |
+
def getConfig(path: str):
    """Parse and return the INI configuration at `path`."""
    parser = configparser.ConfigParser()
    parser.read(path)
    return parser

def cleanText(text: str):
    """Normalize text: newlines become spaces; punctuation except '.' is stripped."""
    flattened = text.replace("\n", " ")
    # Build a translation table that deletes every punctuation char but the period.
    punctuationToDrop = string.punctuation.replace(".", "")
    return flattened.translate(str.maketrans('', '', punctuationToDrop))

def loadYaml(path: str):
    """Load and return the YAML document at `path`."""
    with open(path) as file:
        return yaml.safe_load(file)
|
src/utils/logging.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging

# Module-wide logger: INFO and above, formatted with timestamp/level/module.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logFormat = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
logFormatter = logging.Formatter(fmt = logFormat, style = "%")

# Guard against attaching a duplicate handler when this module is re-imported
# or reloaded, which would emit every record multiple times.
if not logger.handlers:
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(logFormatter)
    logger.addHandler(streamHandler)
|