techconsptrs commited on
Commit
7e24b41
·
1 Parent(s): e26a49a

INITIAL COMMIT

Browse files
.gitignore ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
app.py CHANGED
@@ -1,7 +1,310 @@
1
- import gradio as gr
2
-
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
-
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.pipelines.completePipeline import Pipeline
2
+ import gradio as gr
3
+ import os
4
+
5
+ # os.system("apt-get update -y")
6
+ # os.system("apt-get upgrade -y")
7
+ # os.system("apt install poppler-utils -y")
8
+
9
# Lazily-built RAG chain; stays None until a tab's submit handler creates one.
chain = None
# Single shared pipeline wiring loaders, vector store, and the LLM chain.
pipeline = Pipeline()
11
+
12
+
13
def getTextResponse(text: str, inputQuery: str):
    """Answer *inputQuery* against raw *text*.

    Builds the RAG chain on the first call and caches it in the module-level
    ``chain`` so follow-up questions reuse the same vector store.
    """
    global chain
    if chain is None:
        chain = pipeline.plainText(text = text)
    response = chain.invoke(
        {
            "question": inputQuery
        }
    )
    return response
25
+
26
+
27
def getSearchablePdfResponse(path: str, inputQuery: str):
    """Answer *inputQuery* against the text layer of the searchable PDF at *path*.

    The chain is built once from the PDF and cached in the module-level
    ``chain`` for subsequent questions.
    """
    global chain
    if chain is None:
        chain = pipeline.searchablePdf(path = path)
    response = chain.invoke(
        {
            "question": inputQuery
        }
    )
    return response
39
+
40
def getScannablePdfResponse(path: str, inputQuery: str):
    """Answer *inputQuery* against OCR-extracted text of the scanned PDF at *path*.

    The chain is built once (OCR is expensive) and cached in the module-level
    ``chain`` for subsequent questions.
    """
    global chain
    if chain is None:
        chain = pipeline.scannablePdf(path = path)
    response = chain.invoke(
        {
            "question": inputQuery
        }
    )
    return response
52
+
53
def clearFunction():
    # Drop the cached chain so the next question rebuilds it from fresh input.
    global chain
    chain = None
56
+
57
# Tab 1: question answering over user-pasted plain text.
with gr.Blocks() as textInterface:
    with gr.Row():
        inputText = gr.Textbox(
            label = "Input Text",
            placeholder = "Enter you text here"
        )
    with gr.Row():
        question = gr.Textbox(
            label = "Question",
            placeholder = "Enter your question here"
        )
        answer = gr.Textbox(
            label = "Response",
            interactive = False
        )
    with gr.Row():
        submitButton = gr.Button(
            value = "Submit",
            variant = "primary"
        )
        # ClearButton wipes the listed components; the cached chain is reset
        # separately by the click handler below.
        clearButton = gr.ClearButton(
            components = [inputText, question, answer],
            value = "Clear",
            variant = "secondary"
        )
    submitButton.click(
        fn = getTextResponse,
        inputs = [inputText, question],
        outputs = [answer]
    )
    clearButton.click(
        fn = clearFunction
    )
90
+
91
+
92
# Tab 2: question answering over a searchable (text-layer) PDF.
with gr.Blocks() as searchablePdf:
    with gr.Row():
        inputFile = gr.File(
            file_types = [".pdf"],
            file_count = "single",
            label = "Select PDF"
        )
    with gr.Row():
        question = gr.Textbox(
            label = "Question",
            placeholder = "Enter your question here"
        )
        answer = gr.Textbox(
            label = "Response",
            interactive = False
        )
    with gr.Row():
        submitButton = gr.Button(
            value = "Submit",
            variant = "primary"
        )
        clearButton = gr.ClearButton(
            components = [inputFile, question, answer],
            value = "Clear",
            variant = "secondary"
        )
    submitButton.click(
        fn = getSearchablePdfResponse,
        inputs = [inputFile, question],
        outputs = [answer]
    )
    clearButton.click(
        fn = clearFunction
    )
126
+
127
+
128
# Tab 3: question answering over a scanned PDF (OCR-based extraction).
with gr.Blocks() as scannablePdf:
    with gr.Row():
        inputFile = gr.File(
            file_types = [".pdf"],
            file_count = "single",
            label = "Select PDF"
        )
    with gr.Row():
        question = gr.Textbox(
            label = "Question",
            placeholder = "Enter your question here"
        )
        answer = gr.Textbox(
            label = "Response",
            interactive = False
        )
    with gr.Row():
        submitButton = gr.Button(
            value = "Submit",
            variant = "primary"
        )
        clearButton = gr.ClearButton(
            components = [inputFile, question, answer],
            value = "Clear",
            variant = "secondary"
        )
    submitButton.click(
        fn = getScannablePdfResponse,
        inputs = [inputFile, question],
        outputs = [answer]
    )
    clearButton.click(
        fn = clearFunction
    )
162
+
163
+
164
def getLinksButtonFn(baseUrl: str):
    # Crawl the base URL, then reveal the link picker and the Q&A rows.
    links = pipeline.webCrawler.getLinks(url = baseUrl)
    checkboxes = gr.CheckboxGroup(
        choices = links,
        label = "Fetched Links",
        visible = True
    )
    # Returning components with visible=True updates the hidden rows in place.
    row2 = gr.Row(visible = True)
    row3 = gr.Row(visible = True)
    return (
        checkboxes,
        row2,
        row3
    )
178
+
179
def getWebsiteResponse(links: list[str], inputQuery: str):
    """Answer *inputQuery* against the text of the user-selected *links*.

    The crawl-backed chain is built on the first call and cached in the
    module-level ``chain`` for subsequent questions.
    """
    global chain
    if chain is None:
        # Crawling and indexing happen once per session of questions.
        chain = pipeline.webCrawl(urls = links)
    response = chain.invoke(
        {
            "question": inputQuery
        }
    )
    return response
192
+
193
def clearWebsiteResponse():
    # Reset the cached chain and empty/hide the fetched-links checkbox group.
    global chain
    chain = None
    checkboxes = gr.CheckboxGroup(
        choices = [],
        label = "Fetched Links",
        visible = False
    )
    return checkboxes
202
+
203
# Tab 4: crawl a site, let the user pick pages, then answer questions on them.
with gr.Blocks() as websiteCrawler:
    with gr.Row():
        inputUrl = gr.Textbox(
            label = "Base URL",
            placeholder = "Enter the Base URL to fetch other links",
            scale = 3
        )
        getLinksButton = gr.Button(
            value = "Get Links",
            variant = "primary",
            scale = 1
        )
    # Populated (and made visible) by getLinksButtonFn after the crawl.
    checkboxes = gr.CheckboxGroup(
        choices = [],
        label = "Fetched Links",
    )
    # Q&A rows stay hidden until links have been fetched.
    with gr.Row(visible = False) as row2:
        question = gr.Textbox(
            label = "Question",
            placeholder = "Enter your question here"
        )
        answer = gr.Textbox(
            label = "Response",
            interactive = False
        )
    with gr.Row(visible = False) as row3:
        submitButton = gr.Button(
            value = "Submit",
            variant = "primary"
        )
        clearButton = gr.ClearButton(
            components = [question, answer],
            value = "Clear",
            variant = "secondary"
        )
    getLinksButton.click(
        fn = getLinksButtonFn,
        inputs = [inputUrl],
        outputs = [checkboxes, row2, row3]
    )
    submitButton.click(
        fn = getWebsiteResponse,
        inputs = [checkboxes, question],
        outputs = [answer]
    )
    clearButton.click(
        fn = clearWebsiteResponse,
        inputs = None,
        outputs = [checkboxes]
    )
253
+
254
+
255
def getYoutubeResponse(links: str, inputQuery: str):
    """Answer *inputQuery* against the transcripts of comma-separated YouTube *links*.

    The transcript-backed chain is built on the first call and cached in the
    module-level ``chain`` for subsequent questions.
    """
    global chain
    # Ignore empty segments produced by stray or trailing commas.
    linkList = [link.strip() for link in links.split(",") if link.strip()]
    if chain is None:
        chain = pipeline.youtubeLinks(urls = linkList)
    response = chain.invoke(
        {
            "question": inputQuery
        }
    )
    return response
268
+
269
+
270
# Tab 5: question answering over YouTube video transcripts.
with gr.Blocks() as youtubeInterface:
    with gr.Row():
        inputLinks = gr.Textbox(
            label = "Youtube Links",
            placeholder = 'Enter comma(,)-separated youtube video links'
        )
    with gr.Row():
        question = gr.Textbox(
            label = "Question",
            placeholder = "Enter your question here"
        )
        answer = gr.Textbox(
            label = "Response",
            interactive = False
        )
    with gr.Row():
        submitButton = gr.Button(
            value = "Submit",
            variant = "primary"
        )
        clearButton = gr.ClearButton(
            components = [inputLinks, question, answer],
            value = "Clear",
            variant = "secondary"
        )
    submitButton.click(
        fn = getYoutubeResponse,
        inputs = [inputLinks, question],
        outputs = [answer]
    )
    clearButton.click(
        fn = clearFunction
    )
303
+
304
+
305
# One tab per supported knowledge source; tab order matches the label list.
application = gr.TabbedInterface(
    [textInterface, searchablePdf, scannablePdf, websiteCrawler, youtubeInterface],
    ["Text", "Searchable PDF", "Scannable PDF", "Website Text", "Youtube Transcripts"]
)

application.launch()
config.ini ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [EMBEDDINGS]
2
+ embeddingModel = sentence-transformers/all-MiniLM-L6-v2
3
+ device = cpu
4
+ normalize_embeddings = true
5
+
6
+ [VECTORSTORE]
7
+ chunkSize = 1250
8
+ chunkOverlap = 250
9
+ addStartIndex = true
10
+
11
+ [LLM]
12
+ llmModel = llama-3.1-70b-versatile
13
+ maxTokens = 512
14
+ temperature = 0.75
15
+
16
+ [RETRIEVER]
17
+ searchType = mmr
18
+ k = 5
19
+ fetchK = 10
20
+
21
+ [WEBCRAWLER]
22
+ timeout = 30
23
+
24
+ [EASYOCR]
25
+ gpu = false
params.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ prompt: |
2
+ INSTRUCTIONS:
3
+ =====================================
4
+ ### Role
5
+ **Primary Function**: You are an AI chatbot designed to provide accurate and efficient assistance to users based on provided context data. Your responses must be reliable, friendly, and directly address user inquiries or issues. Always clarify any unclear questions, and conclude responses positively.
6
+ ### Constraints
7
+ 1. **No Data Disclosure**: Never reveal access to training data or any context explicitly.
8
+ 2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
9
+ 3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
10
+ 4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
11
+ Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Make sure the user is always happy and satisfied with the outputs you return.
12
+ CONTEXT:
13
+ =====================================
14
+ {context}
15
+ ======================================
16
+ QUESTION:
17
+ =====================================
18
+ {question}
19
+ NOTE: Generate responses directly without using phrases like "Response:" or "Answer:". NEVER mention the user about usage of any context to generate an answer.
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ langchain
3
+ langchain-core
4
+ langchain-groq
5
+ langchain-community
6
+ langchain_huggingface
7
+ pymupdf
8
+ easyocr
9
+ numpy
10
+ pdf2image
11
+ requests
12
+ python-dotenv
13
+ beautifulsoup4
14
+ youtube-transcript-api
15
+ urllib3
secrets.env ADDED
@@ -0,0 +1 @@
 
 
1
+ # NOTE(review): a real Groq API key was committed here — revoke/rotate it immediately and supply it via the deployment environment instead of version control.
+ GROQ_API_KEY=
setup.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from setuptools import setup, find_packages
2
+
3
def getRequirements(requirementsPath: str) -> list[str]:
    """Read a pip requirements file and return its non-empty requirement lines.

    Blank lines (including the trailing newline) are dropped and surrounding
    whitespace is stripped, so setuptools never sees empty requirement specs.
    """
    with open(requirementsPath) as file:
        return [line.strip() for line in file.read().splitlines() if line.strip()]
7
+
8
# Package metadata; install_requires mirrors requirements.txt at build time.
setup(
    name = "ConversAI",
    author = "Rauhan Ahmed Siddiqui",
    author_email = "rauhaan.siddiqui@gmail.com",
    version = "0.1",
    packages = find_packages(),
    install_requires = getRequirements(requirementsPath = "requirements.txt")
)
src/__init__.py ADDED
File without changes
src/components/__init__.py ADDED
File without changes
src/components/loaders/__init__.py ADDED
File without changes
src/components/loaders/pdfLoader.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.utils.functions import cleanText, getConfig
2
+ from concurrent.futures import ThreadPoolExecutor
3
+ from src.utils.exceptions import CustomException
4
+ from pdf2image import convert_from_path
5
+ from src.utils.logging import logger
6
+ import numpy as np
7
+ import pymupdf
8
+ import easyocr
9
+
10
+
11
+ class PdfLoader:
12
+ def __init__(self) -> None:
13
+ self.config = getConfig(path = "config.ini")
14
+ self.reader = easyocr.Reader(['en'], gpu = self.config.getboolean("EASYOCR", "gpu"))
15
+
16
+ def extractTextFromPage(self, page):
17
+ return cleanText(text = page.get_text())
18
+
19
+ def searchablePdf(self, pdfPath: str):
20
+ try:
21
+ logger.info("Text Extraction Started from Searchable PDF")
22
+ doc = pymupdf.open(pdfPath)
23
+ pages = [doc.load_page(i) for i in range(len(doc))]
24
+ with ThreadPoolExecutor() as executor:
25
+ texts = list(executor.map(self.extractTextFromPage, pages))
26
+ doc.close()
27
+ return "\n".join(texts)
28
+ except Exception as e:
29
+ logger.error(CustomException(e))
30
+
31
+ def getText(self, image):
32
+ text = "\n".join([text[1] for text in self.reader.readtext(np.array(image), paragraph=True)])
33
+ return cleanText(text = text)
34
+
35
+ def scannablePdf(self, pdfPath: str):
36
+ try:
37
+ logger.info("Text Extraction Started from Scannable PDF")
38
+ allImages = convert_from_path(pdfPath)
39
+ texts = [self.getText(image) for image in allImages]
40
+ return "\n".join(texts)
41
+ except Exception as e:
42
+ logger.error(CustomException(e))
src/components/loaders/websiteCrawler.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from concurrent.futures import ThreadPoolExecutor
2
+ from src.utils.exceptions import CustomException
3
+ from urllib.parse import urlparse, urljoin
4
+ from src.utils.functions import getConfig
5
+ from src.utils.functions import cleanText
6
+ from src.utils.logging import logger
7
+ from bs4 import BeautifulSoup
8
+ import time
9
+ import requests
10
+
11
+
12
+ class WebsiteCrawler:
13
+ def __init__(self):
14
+ self.config = getConfig(path = "config.ini")
15
+
16
+ def getLinksFromPage(self, url: str):
17
+ response = requests.get(url)
18
+ soup = BeautifulSoup(response.content, "html.parser")
19
+ anchors = soup.find_all("a")
20
+ links = []
21
+ for anchor in anchors:
22
+ if "href" in anchor.attrs:
23
+ if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
24
+ links.append(anchor.attrs["href"])
25
+ elif not anchor.attrs["href"].startswith(("//", "file", "javascript", "tel", "mailto", "http")):
26
+ links.append(urljoin(url + "/", anchor.attrs["href"]))
27
+ else:
28
+ pass
29
+ links = [link for link in links if "#" not in link]
30
+ links = list(set(links))
31
+ else:
32
+ continue
33
+ return links
34
+
35
+ def getLinks(self, url: str):
36
+ try:
37
+ logger.info("fetching links from url")
38
+ start = time.time()
39
+ links = self.getLinksFromPage(url)
40
+ uniqueLinks = set()
41
+ for link in links:
42
+ now = time.time()
43
+ if now - start > self.config.getint("WEBCRAWLER", "timeout"):
44
+ break
45
+ else:
46
+ uniqueLinks = uniqueLinks.union(set(self.getLinksFromPage(link)))
47
+ return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
48
+ except Exception as e:
49
+ logger.error(CustomException(e))
50
+
51
+ def extractTextFromUrl(self, url: str):
52
+ response = requests.get(url)
53
+ response.raise_for_status()
54
+ html = response.text
55
+ soup = BeautifulSoup(html, 'html.parser')
56
+ return cleanText(text = soup.get_text(separator=' ', strip=True))
57
+
58
+ def extractTextFromUrlList(self, urls: list[str]):
59
+ try:
60
+ logger.info("extracting text from urls")
61
+ with ThreadPoolExecutor() as executor:
62
+ texts = list(executor.map(self.extractTextFromUrl, urls))
63
+ return "\n".join(texts)
64
+ except Exception as e:
65
+ logger.error(CustomException(e))
src/components/loaders/youtubeLoader.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import YoutubeLoader
2
+ from src.utils.exceptions import CustomException
3
+ from src.utils.functions import cleanText
4
+ from src.utils.logging import logger
5
+
6
+
7
class YoutubeTranscriptLoader:
    """Fetch and clean transcripts for a collection of YouTube video URLs."""

    def __init__(self):
        pass

    def getTranscripts(self, urls: list[str]):
        """Return the newline-joined transcripts of the unique *urls*.

        A video whose transcript cannot be fetched contributes an empty string
        (the error is logged) instead of aborting the whole batch.
        """
        texts = []
        for url in set(urls):
            try:
                loader = YoutubeLoader.from_youtube_url(
                    url, add_video_info=False
                )
                doc = " ".join([x.page_content for x in loader.load()])
                texts.append(cleanText(text = doc))
            except Exception as e:
                logger.error(CustomException(e))
                # Failed videos still contribute an (empty) entry.
                texts.append("")
        return "\n".join(texts)
src/components/rag/RAG.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.components.vectors.vectorstore import VectorStore
2
+ from langchain_core.output_parsers import StrOutputParser
3
+ from langchain_core.prompts import ChatPromptTemplate
4
+ from langchain_core.runnables import RunnableLambda
5
+ from src.utils.exceptions import CustomException
6
+ from src.utils.functions import getConfig
7
+ from src.utils.functions import loadYaml
8
+ from src.utils.logging import logger
9
+ from langchain_groq import ChatGroq
10
+
11
+
12
+ class Chain:
13
+ def __init__(self):
14
+ self.config = getConfig(path = "config.ini")
15
+ self.store = VectorStore()
16
+ prompt = loadYaml(path = "params.yaml")["prompt"]
17
+ self.prompt = ChatPromptTemplate.from_template(prompt)
18
+
19
+ def formatDocs(self, docs):
20
+ context = ""
21
+ for doc in docs:
22
+ context += f"{doc}\n\n\n"
23
+ if context == "":
24
+ context = "No Context Found"
25
+ else:
26
+ pass
27
+ return context
28
+
29
+ def returnChain(self, text: str):
30
+ try:
31
+ logger.info("preparing chain")
32
+ store = self.store.setupStore(text = text)
33
+ chain = (
34
+ {"context": RunnableLambda(lambda x: x["question"]) | store | RunnableLambda(self.formatDocs),
35
+ "question": RunnableLambda(lambda x: x["question"])}
36
+ | self.prompt
37
+ | ChatGroq(model_name = self.config.get("LLM", "llmModel"), temperature = self.config.getfloat("LLM", "temperature"), max_tokens = self.config.getint("LLM", "maxTokens"))
38
+ | StrOutputParser()
39
+ )
40
+ return chain
41
+ except Exception as e:
42
+ logger.error(CustomException(e))
src/components/rag/__init__.py ADDED
File without changes
src/components/vectors/__init__.py ADDED
File without changes
src/components/vectors/vectorstore.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
2
+ from langchain_core.vectorstores import InMemoryVectorStore
3
+ from langchain_community.docstore.document import Document
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ from src.utils.exceptions import CustomException
6
+ from src.utils.functions import getConfig
7
+ from src.utils.logging import logger
8
+
9
class VectorStore:
    """In-memory vector store factory: splits text, embeds it, returns a retriever."""

    def __init__(self):
        self.config = getConfig(path = "config.ini")
        # Embedding model, device, and normalization all come from config.ini.
        self.vectorEmbeddings = HuggingFaceEmbeddings(
            model_name = self.config.get("EMBEDDINGS", "embeddingModel"),
            model_kwargs = {"device": self.config.get("EMBEDDINGS", "device")},
            encode_kwargs = {"normalize_embeddings": self.config.getboolean("EMBEDDINGS", "normalize_embeddings")}
        )
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size = self.config.getint("VECTORSTORE", "chunkSize"),
            chunk_overlap = self.config.getint("VECTORSTORE", "chunkOverlap"),
            add_start_index = self.config.getboolean("VECTORSTORE", "addStartIndex")
        )

    def setupStore(self, text: str):
        """Chunk *text*, embed the chunks, and return a configured retriever.

        Returns None when indexing fails (the error is logged, not raised).
        """
        try:
            store = InMemoryVectorStore(self.vectorEmbeddings)
            textDocument = Document(page_content = text)
            documents = self.splitter.split_documents([textDocument])
            store.add_documents(documents = documents)
            return store.as_retriever(
                search_type = self.config.get("RETRIEVER", "searchType"),
                search_kwargs = {
                    "k": self.config.getint("RETRIEVER", "k"),
                    "fetch_k": self.config.getint("RETRIEVER", "fetchK")
                }
            )
        except Exception as e:
            # Stray debug print removed; the logger is the single error sink.
            logger.error(CustomException(e))
src/pipelines/__init__.py ADDED
File without changes
src/pipelines/completePipeline.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.components.loaders.websiteCrawler import WebsiteCrawler
2
+ from src.components.loaders.youtubeLoader import YoutubeTranscriptLoader
3
+ from src.components.loaders.pdfLoader import PdfLoader
4
+ from src.components.rag.RAG import Chain
5
+ from dotenv import load_dotenv
6
+
7
+ load_dotenv("secrets.env")
8
+
9
class Pipeline:
    """Facade over all loaders: each method extracts text and returns a RAG chain."""

    def __init__(self):
        self.pdfLoader = PdfLoader()
        self.webCrawler = WebsiteCrawler()
        self.youtubeLoader = YoutubeTranscriptLoader()
        self.ragChain = Chain()

    def plainText(self, text: str):
        """Build a chain directly from user-supplied text."""
        chain = self.ragChain.returnChain(text = text)
        return chain

    def searchablePdf(self, path: str):
        """Build a chain from the text layer of a searchable PDF."""
        extractedText = self.pdfLoader.searchablePdf(pdfPath = path)
        chain = self.ragChain.returnChain(text = extractedText)
        return chain

    def scannablePdf(self, path: str):
        """Build a chain from OCR-extracted text of a scanned PDF."""
        extractedText = self.pdfLoader.scannablePdf(pdfPath = path)
        chain = self.ragChain.returnChain(text = extractedText)
        return chain

    def webCrawl(self, urls: list[str]):
        """Build a chain from the visible text of the given web pages."""
        extractedText = self.webCrawler.extractTextFromUrlList(urls = urls)
        chain = self.ragChain.returnChain(text = extractedText)
        return chain

    def youtubeLinks(self, urls: list[str]):
        """Build a chain from the transcripts of the given YouTube videos."""
        extractedText = self.youtubeLoader.getTranscripts(urls = urls)
        # Debug print of the full transcript removed.
        chain = self.ragChain.returnChain(text = extractedText)
        return chain
src/utils/__init__.py ADDED
File without changes
src/utils/exceptions.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
def error_message_detail(error):
    """Format *error* with the file name and line number of the active exception.

    Falls back to the bare error text when called with no exception in flight
    (the previous version raised AttributeError on the None traceback).
    """
    _, _, exc_info = sys.exc_info()
    if exc_info is None:
        return "Error encountered saying [{}]".format(error)
    filename = exc_info.tb_frame.f_code.co_filename
    lineno = exc_info.tb_lineno
    error_message = "Error encountered in line no [{}], filename : [{}], saying [{}]".format(lineno, filename, error)
    return error_message
9
+
10
class CustomException(Exception):
    """Exception wrapper that enriches the message with file/line context.

    Must be constructed while the original exception is still being handled,
    because the location details come from the live traceback.
    """

    def __init__(self, error_message):
        super().__init__(error_message)
        # Resolve file/line details eagerly, while the traceback is available.
        self.error_message = error_message_detail(error_message)

    def __str__(self) -> str:
        return self.error_message
src/utils/functions.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import configparser
2
+ import string
3
+ import yaml
4
+
5
def getConfig(path: str):
    """Load the INI configuration file at *path* and return the parser."""
    parser = configparser.ConfigParser()
    parser.read(path)
    return parser
9
+
10
def cleanText(text: str):
    """Flatten newlines to spaces and strip all punctuation except periods."""
    flattened = text.replace("\n", " ")
    punctuationExceptPeriod = string.punctuation.replace(".", "")
    return flattened.translate(str.maketrans('', '', punctuationExceptPeriod))
14
+
15
def loadYaml(path: str):
    """Parse the YAML file at *path* and return the resulting Python object."""
    with open(path) as file:
        content = yaml.safe_load(file)
    return content
src/utils/logging.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging

# Module-wide logger, configured once at import time.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# One bracketed line per record: timestamp, level, module, message.
logFormat = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
logFormatter = logging.Formatter(fmt = logFormat, style = "%")

# Emit to the console only; no file handler is attached.
streamHandler = logging.StreamHandler()
streamHandler.setFormatter(logFormatter)

logger.addHandler(streamHandler)