Spaces:
Runtime error
Runtime error
LOUIS SANNA
committed on
Commit
•
bddb702
1
Parent(s):
dedb34c
clean(*): add black formatter
Browse files- app.py +10 -10
- load.py +8 -9
- poetry.lock +77 -1
- pyproject.toml +3 -0
app.py
CHANGED
@@ -3,21 +3,22 @@ from dotenv import load_dotenv
|
|
3 |
# Load environment variables from .env file
|
4 |
load_dotenv()
|
5 |
|
6 |
-
from langchain.embeddings import OpenAIEmbeddings
|
7 |
-
from langchain.vectorstores import Chroma
|
8 |
from langchain.chains import ConversationalRetrievalChain
|
9 |
-
from langchain.llms import OpenAI
|
10 |
import gradio as gr
|
11 |
-
from gradio import inputs, outputs
|
12 |
-
from gradio.mix import Parallel
|
13 |
|
14 |
max_sources = 4
|
15 |
DB_DIR = "chroma"
|
16 |
|
17 |
embedding = OpenAIEmbeddings()
|
18 |
vectordb = Chroma(persist_directory=DB_DIR, embedding_function=embedding)
|
19 |
-
pdf_qa = ConversationalRetrievalChain.from_llm(
|
20 |
-
|
|
|
|
|
|
|
21 |
|
22 |
|
23 |
def chat_pdf(query, chat_history=""):
|
@@ -36,7 +37,6 @@ def chat_pdf(query, chat_history=""):
|
|
36 |
# Pad the outputs to match the number of output components in the Gradio interface
|
37 |
padded_outputs = [answer] + cleaned_docs + [""] * (max_sources - len(cleaned_docs))
|
38 |
return padded_outputs
|
39 |
-
return [answer] + cleaned_docs
|
40 |
|
41 |
|
42 |
def create_outputs(num_sources):
|
@@ -55,8 +55,8 @@ iface = gr.Interface(
|
|
55 |
examples=[
|
56 |
["Give 2 species of fulgoroidea"],
|
57 |
["What colors are found among fulgoroidea?"],
|
58 |
-
["Why are fulgoroidea so cute?"]
|
59 |
],
|
60 |
)
|
61 |
|
62 |
-
iface.launch(debug=True)
|
|
|
3 |
# Load environment variables from .env file
|
4 |
load_dotenv()
|
5 |
|
6 |
+
from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
|
7 |
+
from langchain.vectorstores import Chroma # for the vectorization part
|
8 |
from langchain.chains import ConversationalRetrievalChain
|
9 |
+
from langchain.llms import OpenAI # the LLM model we'll use (ChatGPT)
|
10 |
import gradio as gr
|
|
|
|
|
11 |
|
12 |
max_sources = 4
|
13 |
DB_DIR = "chroma"
|
14 |
|
15 |
embedding = OpenAIEmbeddings()
|
16 |
vectordb = Chroma(persist_directory=DB_DIR, embedding_function=embedding)
|
17 |
+
pdf_qa = ConversationalRetrievalChain.from_llm(
|
18 |
+
OpenAI(temperature=0.9, model_name="gpt-3.5-turbo"),
|
19 |
+
vectordb.as_retriever(),
|
20 |
+
return_source_documents=True,
|
21 |
+
)
|
22 |
|
23 |
|
24 |
def chat_pdf(query, chat_history=""):
|
|
|
37 |
# Pad the outputs to match the number of output components in the Gradio interface
|
38 |
padded_outputs = [answer] + cleaned_docs + [""] * (max_sources - len(cleaned_docs))
|
39 |
return padded_outputs
|
|
|
40 |
|
41 |
|
42 |
def create_outputs(num_sources):
|
|
|
55 |
examples=[
|
56 |
["Give 2 species of fulgoroidea"],
|
57 |
["What colors are found among fulgoroidea?"],
|
58 |
+
["Why are fulgoroidea so cute?"],
|
59 |
],
|
60 |
)
|
61 |
|
62 |
+
iface.launch(debug=True)
|
load.py
CHANGED
@@ -3,11 +3,9 @@ from dotenv import load_dotenv
|
|
3 |
# Load environment variables from .env file
|
4 |
load_dotenv()
|
5 |
|
6 |
-
from langchain.document_loaders import UnstructuredFileLoader
|
7 |
-
from langchain.embeddings import OpenAIEmbeddings
|
8 |
-
from langchain.vectorstores import Chroma
|
9 |
-
from langchain.chains import ConversationalRetrievalChain
|
10 |
-
from langchain.llms import OpenAI # the LLM model we'll use (ChatGPT)
|
11 |
from langchain.text_splitter import CharacterTextSplitter
|
12 |
from glob import glob
|
13 |
import os
|
@@ -33,9 +31,10 @@ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
|
|
33 |
documents = text_splitter.split_documents(documents)
|
34 |
|
35 |
# Now, all_pages contains all the pages from every document
|
36 |
-
print(f
|
37 |
|
38 |
embeddings = OpenAIEmbeddings()
|
39 |
-
vectordb = Chroma.from_documents(
|
40 |
-
|
41 |
-
|
|
|
|
3 |
# Load environment variables from .env file
|
4 |
load_dotenv()
|
5 |
|
6 |
+
from langchain.document_loaders import UnstructuredFileLoader # for loading the pdf
|
7 |
+
from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
|
8 |
+
from langchain.vectorstores import Chroma # for the vectorization part
|
|
|
|
|
9 |
from langchain.text_splitter import CharacterTextSplitter
|
10 |
from glob import glob
|
11 |
import os
|
|
|
31 |
documents = text_splitter.split_documents(documents)
|
32 |
|
33 |
# Now, all_pages contains all the pages from every document
|
34 |
+
print(f"Total pages: {len(documents)}")
|
35 |
|
36 |
embeddings = OpenAIEmbeddings()
|
37 |
+
vectordb = Chroma.from_documents(
|
38 |
+
documents, embedding=embeddings, persist_directory=DB_DIR
|
39 |
+
)
|
40 |
+
vectordb.persist()
|
poetry.lock
CHANGED
@@ -235,6 +235,54 @@ files = [
|
|
235 |
{file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
|
236 |
]
|
237 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
[[package]]
|
239 |
name = "certifi"
|
240 |
version = "2023.5.7"
|
@@ -2518,6 +2566,18 @@ sql-other = ["SQLAlchemy (>=1.4.16)"]
|
|
2518 |
test = ["hypothesis (>=6.34.2)", "pytest (>=7.0.0)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"]
|
2519 |
xml = ["lxml (>=4.6.3)"]
|
2520 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2521 |
[[package]]
|
2522 |
name = "pdfminer-six"
|
2523 |
version = "20221105"
|
@@ -2619,6 +2679,22 @@ files = [
|
|
2619 |
docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"]
|
2620 |
tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"]
|
2621 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2622 |
[[package]]
|
2623 |
name = "posthog"
|
2624 |
version = "3.0.1"
|
@@ -4251,4 +4327,4 @@ cffi = ["cffi (>=1.11)"]
|
|
4251 |
[metadata]
|
4252 |
lock-version = "2.0"
|
4253 |
python-versions = "^3.11"
|
4254 |
-
content-hash = "
|
|
|
235 |
{file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
|
236 |
]
|
237 |
|
238 |
+
[[package]]
|
239 |
+
name = "black"
|
240 |
+
version = "23.3.0"
|
241 |
+
description = "The uncompromising code formatter."
|
242 |
+
category = "dev"
|
243 |
+
optional = false
|
244 |
+
python-versions = ">=3.7"
|
245 |
+
files = [
|
246 |
+
{file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"},
|
247 |
+
{file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"},
|
248 |
+
{file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"},
|
249 |
+
{file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"},
|
250 |
+
{file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"},
|
251 |
+
{file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"},
|
252 |
+
{file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"},
|
253 |
+
{file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"},
|
254 |
+
{file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"},
|
255 |
+
{file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"},
|
256 |
+
{file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"},
|
257 |
+
{file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"},
|
258 |
+
{file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"},
|
259 |
+
{file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"},
|
260 |
+
{file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"},
|
261 |
+
{file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"},
|
262 |
+
{file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"},
|
263 |
+
{file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"},
|
264 |
+
{file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"},
|
265 |
+
{file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"},
|
266 |
+
{file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"},
|
267 |
+
{file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"},
|
268 |
+
{file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"},
|
269 |
+
{file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"},
|
270 |
+
{file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"},
|
271 |
+
]
|
272 |
+
|
273 |
+
[package.dependencies]
|
274 |
+
click = ">=8.0.0"
|
275 |
+
mypy-extensions = ">=0.4.3"
|
276 |
+
packaging = ">=22.0"
|
277 |
+
pathspec = ">=0.9.0"
|
278 |
+
platformdirs = ">=2"
|
279 |
+
|
280 |
+
[package.extras]
|
281 |
+
colorama = ["colorama (>=0.4.3)"]
|
282 |
+
d = ["aiohttp (>=3.7.4)"]
|
283 |
+
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
|
284 |
+
uvloop = ["uvloop (>=0.15.2)"]
|
285 |
+
|
286 |
[[package]]
|
287 |
name = "certifi"
|
288 |
version = "2023.5.7"
|
|
|
2566 |
test = ["hypothesis (>=6.34.2)", "pytest (>=7.0.0)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"]
|
2567 |
xml = ["lxml (>=4.6.3)"]
|
2568 |
|
2569 |
+
[[package]]
|
2570 |
+
name = "pathspec"
|
2571 |
+
version = "0.11.1"
|
2572 |
+
description = "Utility library for gitignore style pattern matching of file paths."
|
2573 |
+
category = "dev"
|
2574 |
+
optional = false
|
2575 |
+
python-versions = ">=3.7"
|
2576 |
+
files = [
|
2577 |
+
{file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"},
|
2578 |
+
{file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"},
|
2579 |
+
]
|
2580 |
+
|
2581 |
[[package]]
|
2582 |
name = "pdfminer-six"
|
2583 |
version = "20221105"
|
|
|
2679 |
docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"]
|
2680 |
tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"]
|
2681 |
|
2682 |
+
[[package]]
|
2683 |
+
name = "platformdirs"
|
2684 |
+
version = "3.5.1"
|
2685 |
+
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
|
2686 |
+
category = "dev"
|
2687 |
+
optional = false
|
2688 |
+
python-versions = ">=3.7"
|
2689 |
+
files = [
|
2690 |
+
{file = "platformdirs-3.5.1-py3-none-any.whl", hash = "sha256:e2378146f1964972c03c085bb5662ae80b2b8c06226c54b2ff4aa9483e8a13a5"},
|
2691 |
+
{file = "platformdirs-3.5.1.tar.gz", hash = "sha256:412dae91f52a6f84830f39a8078cecd0e866cb72294a5c66808e74d5e88d251f"},
|
2692 |
+
]
|
2693 |
+
|
2694 |
+
[package.extras]
|
2695 |
+
docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.2.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"]
|
2696 |
+
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"]
|
2697 |
+
|
2698 |
[[package]]
|
2699 |
name = "posthog"
|
2700 |
version = "3.0.1"
|
|
|
4327 |
[metadata]
|
4328 |
lock-version = "2.0"
|
4329 |
python-versions = "^3.11"
|
4330 |
+
content-hash = "5efc01b2243b9e30421de14ff20d9331c548377e049d09cfa59179364f996019"
|
pyproject.toml
CHANGED
@@ -17,6 +17,9 @@ tiktoken = "^0.4.0"
|
|
17 |
pytesseract = "^0.3.10"
|
18 |
|
19 |
|
|
|
|
|
|
|
20 |
[build-system]
|
21 |
requires = ["poetry-core"]
|
22 |
build-backend = "poetry.core.masonry.api"
|
|
|
17 |
pytesseract = "^0.3.10"
|
18 |
|
19 |
|
20 |
+
[tool.poetry.group.dev.dependencies]
|
21 |
+
black = "^23.3.0"
|
22 |
+
|
23 |
[build-system]
|
24 |
requires = ["poetry-core"]
|
25 |
build-backend = "poetry.core.masonry.api"
|