v2.

- .gitattributes +3 -0
- .gitignore +179 -0
- __pycache__/config.cpython-311.pyc +0 -0
- __pycache__/config_key.cpython-311.pyc +0 -0
- app.py +27 -0
- config.py +19 -0
- config_key.py +1 -0
- requirements.txt +0 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/config_key.cpython-311.pyc +0 -0
- src/control/__init__.py +0 -0
- src/control/__pycache__/__init__.cpython-311.pyc +0 -0
- src/control/__pycache__/control.cpython-311.pyc +0 -0
- src/control/control.py +116 -0
- src/model/__init__.py +0 -0
- src/model/__pycache__/__init__.cpython-311.pyc +0 -0
- src/model/__pycache__/block.cpython-311.pyc +0 -0
- src/model/__pycache__/container.cpython-311.pyc +0 -0
- src/model/__pycache__/doc.cpython-311.pyc +0 -0
- src/model/__pycache__/paragraph.cpython-311.pyc +0 -0
- src/model/block.py +58 -0
- src/model/container.py +112 -0
- src/model/doc.py +79 -0
- src/model/paragraph.py +39 -0
- src/tools/__init__.py +0 -0
- src/tools/__pycache__/__init__.cpython-311.pyc +0 -0
- src/tools/__pycache__/index_creation.cpython-311.pyc +0 -0
- src/tools/__pycache__/llm.cpython-311.pyc +0 -0
- src/tools/__pycache__/reader_html.cpython-311.pyc +0 -0
- src/tools/__pycache__/reader_pdf_tools.cpython-311.pyc +0 -0
- src/tools/__pycache__/reader_word.cpython-311.pyc +0 -0
- src/tools/__pycache__/readers_pdf.cpython-311.pyc +0 -0
- src/tools/__pycache__/retriever.cpython-311.pyc +0 -0
- src/tools/__pycache__/table_converter.cpython-311.pyc +0 -0
- src/tools/index_creation.py +67 -0
- src/tools/llm.py +149 -0
- src/tools/pretty_print.py +33 -0
- src/tools/reader_html.py +118 -0
- src/tools/reader_pdf_tools.py +56 -0
- src/tools/reader_word.py +106 -0
- src/tools/readers_pdf.py +428 -0
- src/tools/retriever.py +49 -0
- src/tools/table_converter.py +14 -0
- src/view/__pycache__/view.cpython-311.pyc +0 -0
- src/view/view.py +262 -0
- styles.txt +18 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+env/lib/python3.10/site-packages/*.so filter=lfs diff=lfs merge=lfs -text
+env/lib/python3.10/site-packages/*.dylib filter=lfs diff=lfs merge=lfs -text
+env/lib/python3.10/site-packages/**/*.js.map filter=lfs diff=lfs merge=lfs -text

.gitignore
ADDED
@@ -0,0 +1,179 @@

config_key.py


#library package
sqlite_updated/

#Test folder + files
data/Test/
test.py
test_read.py
styles.txt

#database folder
database/
database_structure/
database_word/
Ilumio_chatbot/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy
.env
# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

__pycache__/config.cpython-311.pyc
ADDED
Binary file (1.41 kB).

__pycache__/config_key.cpython-311.pyc
ADDED
Binary file (239 Bytes).

app.py
ADDED
@@ -0,0 +1,27 @@
import os
from config import *
from src.tools.llm import LlmAgent
import src.view.view as view
from src.control.control import Chatbot
import chromadb
from src.tools.retriever import Retriever

os.environ["TOKENIZERS_PARALLELISM"] = "true"

if "OPENAI_API_KEY" not in os.environ:
    from config_key import OPENAI_API_KEY
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

llm_model = "gpt-4"
llm = LlmAgent(llm_model=llm_model)

if not os.path.exists("database_structure/"):
    os.makedirs("database_structure/")

client_db = chromadb.PersistentClient("database_structure/")

chat = Chatbot(client_db=client_db, retriever=Retriever(llmagent=llm), llm_agent=llm)

ilumio_qna = view.run(ctrl=chat, config=view_config)

ilumio_qna.queue().launch()

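A note on the persistent Chroma client created above: collections written under "database_structure/" survive process restarts, which is what upload_doc in src/control/control.py relies on to skip re-indexing a document it has already seen. A minimal sketch of that behaviour (not part of the commit; the collection name and document are made up, and Chroma's default embedding model is used, which may be downloaded on first use):

import chromadb

client = chromadb.PersistentClient("database_structure/")
collection = client.get_or_create_collection(name="demo_collection")  # hypothetical name
if collection.count() == 0:
    collection.add(documents=["hello world"], ids=["doc-1"])
print(collection.count())  # 1, and still 1 after restarting the process
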
config.py
ADDED
@@ -0,0 +1,19 @@
content_language = 'en'
plan_language = 'en'
content_en_path_real = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
content_test = "data/Test/Illumio_product_brief.pdf"
content_python = "data/cours-python_crop.docx"
content_html = "data/Test/list.html"
content_data_analyst = "data/Test/Data_Analyst_chez_Stockly.pdf"
content_test_epita = "data/Test/Test_epita.pdf"

examples = {"Question_1": "What is the max_results parameter for async traffic queries ?",
            "Question_2": "How can I use the Public Experimental Provisioning API to determine if a specific set of objects can be provisioned?",
            "Question_3": "Explain the potential challenges and workarounds when using json-query with the curl -i option. Why might this combination lead to errors?",
            }


view_config = {
    'title': "<h1 style=text-align:center;font-size:4.5em;background-image:linear-gradient(45deg,#f3ec78,#af4261);background-color:red;background-size:100%;background-repeat:repeat;-webkit-background-clip:text;-webkit-text-fill-color:transparent;-moz-background-clip:text;-moz-text-fill-color:transparent;font-weight:bold;margin-top:4%;padding-bottom:1%>Document QnA</h1>",
    'examples': examples,
}

config_key.py
ADDED
@@ -0,0 +1 @@
OPENAI_API_KEY = "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"

requirements.txt
ADDED
Binary file (5.51 kB).

src/__init__.py
ADDED
File without changes

src/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (172 Bytes).

src/__pycache__/config_key.cpython-311.pyc
ADDED
Binary file (269 Bytes).

src/control/__init__.py
ADDED
File without changes

src/control/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (180 Bytes).

src/control/__pycache__/control.cpython-311.pyc
ADDED
Binary file (7.87 kB).

src/control/control.py
ADDED
@@ -0,0 +1,116 @@
import os
import chromadb
from src.tools.retriever import Retriever
from src.tools.llm import LlmAgent
from src.model.block import Block
from src.model.doc import Doc
from chromadb.utils import embedding_functions
import gradio as gr


class Chatbot:
    def __init__(self, llm_agent: LlmAgent = None, retriever: Retriever = None, client_db=None):
        self.retriever = retriever
        self.llm = llm_agent
        self.client_db = client_db

    def get_response(self, query, histo):
        histo_conversation, histo_queries = self._get_histo(histo)
        language_of_query = self.llm.detect_language_v2(query).lower()
        queries = self.llm.translate_v2(histo_queries)
        if "en" in language_of_query:
            language_of_query = "en"
        else:
            language_of_query = "fr"
        block_sources = self.retriever.similarity_search(queries=queries)
        block_sources = self._select_best_sources(block_sources)
        sources_contents = [f"Paragraph title : {s.title}\n-----\n{s.content}" if s.title else f"Paragraph {s.index}\n-----\n{s.content}" for s in block_sources]
        context = '\n'.join(sources_contents)
        i = 1
        while (len(context) + len(histo_conversation) > 15000) and i < len(sources_contents):
            context = "\n".join(sources_contents[:-i])
            i += 1
        answer = self.llm.generate_paragraph_v2(query=query, histo=histo_conversation, context=context, language=language_of_query)
        answer = self._clean_chatgpt_answer(answer)
        return answer, block_sources

    @staticmethod
    def _select_best_sources(sources: [Block], delta_1_2=0.15, delta_1_n=0.3, absolute=1.2, alpha=0.9) -> [Block]:
        """
        Select the best sources: not far from the very best, not far from the last selected, and not too bad per se
        """
        best_sources = []
        for idx, s in enumerate(sources):
            if idx == 0 \
                    or (s.distance - sources[idx - 1].distance < delta_1_2
                        and s.distance - sources[0].distance < delta_1_n) \
                    or s.distance < absolute:
                best_sources.append(s)
                delta_1_2 *= alpha
                delta_1_n *= alpha
                absolute *= alpha
            else:
                break
        return best_sources

    @staticmethod
    def _get_histo(histo: [(str, str)]) -> (str, str):
        histo_conversation = ""
        histo_queries = ""

        for (query, answer) in histo[-5:]:
            histo_conversation += f'user: {query} \n bot: {answer}\n'
            histo_queries += query + '\n'
        return histo_conversation[:-1], histo_queries

    @staticmethod
    def _clean_answer(answer: str) -> str:
        print(answer)
        answer = answer.strip('bot:')
        while answer and answer[-1] in {"'", '"', " ", "`"}:
            answer = answer[:-1]
        while answer and answer[0] in {"'", '"', " ", "`"}:
            answer = answer[1:]
        answer = answer.strip('bot:')
        if answer:
            if answer[-1] != ".":
                answer += "."
        return answer

    def _clean_chatgpt_answer(self, answer: str) -> str:
        answer = answer.strip('bot:')
        answer = answer.strip('Answer:')
        answer = answer.strip('Réponse:')
        while answer and answer[-1] in {"'", '"', " ", "`"}:
            answer = answer[:-1]
        return answer

    def upload_doc(self, input_doc, include_images_, actual_page_start):
        title = Doc.get_title(Doc, input_doc.name)
        extension = title.split('.')[-1]
        if extension and (extension == 'docx' or extension == 'pdf' or extension == 'html'):
            open_ai_embedding = embedding_functions.OpenAIEmbeddingFunction(api_key=os.environ['OPENAI_API_KEY'], model_name="text-embedding-ada-002")
            coll_name = "".join([c if c.isalnum() else "_" for c in title])
            collection = self.client_db.get_or_create_collection(name=coll_name, embedding_function=open_ai_embedding)

            if collection.count() == 0:
                gr.Info("Please wait while your document is being analysed")
                print("Database is empty")
                doc = Doc(path=input_doc.name, include_images=include_images_, actual_first_page=actual_page_start)

                # for block in doc.blocks:  # DEBUG PART
                #     print(f"{block.index} : {block.content}")

                retriever = Retriever(doc.container, collection=collection, llmagent=self.llm)
            else:
                print("Database is not empty")
                retriever = Retriever(collection=collection, llmagent=self.llm)

            self.retriever = retriever
        else:
            return False
        return True

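To make the thresholding in _select_best_sources concrete, here is an illustrative run (not part of the commit; the distances are made up, and the repository root is assumed to be on PYTHONPATH). A hit is kept if it is the first one, close enough to both the previous hit and the best hit, or good in absolute terms; each accepted hit shrinks all three tolerances by alpha, and the first rejection stops the scan:

from src.model.block import Block
from src.control.control import Chatbot

blocks = [Block(index=str(i), distance=d) for i, d in enumerate([0.9, 1.0, 1.05, 1.6])]
kept = Chatbot._select_best_sources(blocks)
# 0.9 is kept as the first hit; 1.0 and 1.05 stay within the decayed deltas;
# 1.6 fails every test, so selection stops there.
print([b.distance for b in kept])  # [0.9, 1.0, 1.05]
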
src/model/__init__.py
ADDED
File without changes

src/model/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (178 Bytes).

src/model/__pycache__/block.cpython-311.pyc
ADDED
Binary file (3.04 kB).

src/model/__pycache__/container.cpython-311.pyc
ADDED
Binary file (5.77 kB).

src/model/__pycache__/doc.cpython-311.pyc
ADDED
Binary file (4.05 kB).

src/model/__pycache__/paragraph.cpython-311.pyc
ADDED
Binary file (2.64 kB).

src/model/block.py
ADDED
@@ -0,0 +1,58 @@
import math

class Block:
    def __init__(self, doc: str = '', title: str = '', content: str = '',
                 index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
        self.doc = doc
        self.title = title
        self.content = content
        self.index = index
        self.rank = rank
        self.level = level
        self.distance = distance

    @property
    def distance_str(self) -> str:
        return format(self.distance, '.2f')

    def separate_1_block_in_n(self, max_size=4500):
        """
        Separate a block in n blocks of equal size
        """
        content_length = len(self.content)
        n = math.ceil(content_length / max_size)
        block_size = content_length // n
        new_blocks = []
        for i in range(n):
            start = i * block_size
            end = (i + 1) * block_size if i < n - 1 else None
            new_blocks.append(Block(doc=self.doc,
                                    title=self.title + f"_part{i}",
                                    content=self.content[start:end],
                                    index=self.index + f"_{i}",
                                    rank=self.rank,
                                    level=self.level))
        return new_blocks

    def to_dict(self) -> {}:
        block_dict = {'doc': self.doc,
                      'title': self.title,
                      'content': self.content,
                      'index': self.index,
                      'rank': self.rank,
                      'level': self.level,
                      'distance': self.distance}
        return block_dict

    def from_dict(self, block_dict: {}):
        self.doc = block_dict['doc']
        self.title = block_dict['title']
        self.content = block_dict['content']
        self.index = block_dict['index']
        self.rank = block_dict['rank']
        self.level = block_dict['level']
        self.distance = block_dict['distance']
        return self

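separate_1_block_in_n splits oversized blocks into near-equal chunks so each stays under the embedding size limit. A quick sketch (not part of the commit; repository root assumed on PYTHONPATH):

from src.model.block import Block

b = Block(doc="demo", title="T", content="x" * 10000, index="1")
parts = b.separate_1_block_in_n(max_size=4500)
print(len(parts))                       # 3, i.e. ceil(10000 / 4500)
print([len(p.content) for p in parts])  # [3333, 3333, 3334]
print(parts[0].title, parts[0].index)   # T_part0 1_0
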
src/model/container.py
ADDED
@@ -0,0 +1,112 @@
from .paragraph import Paragraph
from .block import Block

INFINITE = 99999

class Container:

    def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, index: [int] = None, father=None, id_=0):
        if index is None:
            index = []
        self.level = level
        self.title = title
        self.paragraphs = []
        self.children = []
        self.index = index
        self.father = father
        self.id_ = int(str(1) + str(father.id_) + str(id_))
        if paragraphs:
            self.paragraphs, self.children = self.create_children(paragraphs, level, index)
        self.containers = [self]
        for child in self.children:
            self.containers += child.containers
        self.blocks = self.get_blocks()

    def get_blocks(self):
        block = Block(level=self.level, index=self.index)
        if self.title:
            self.title.text = self.title.text.replace('\r', '').replace('\n', '')
            block.title = self.title.text
            block.content = self.title.text + '/'
            temp_father = self.father
            while temp_father and type(temp_father) == Container:
                if temp_father.title:
                    temp_father.title.text = temp_father.title.text.replace('\r', '').replace('\n', '')
                    block.content = temp_father.title.text + '/' + block.content
                temp_father = temp_father.father
            block.content += " :\n\n"
        i = 0
        for p in self.paragraphs:
            if not p.blank:
                i = 1
                block.content += p.text
        if i == 0:
            blocks = []
        else:
            blocks = [block]
        for child in self.children:
            blocks += child.blocks
        return blocks

    def create_children(self, paragraphs, level, rank) -> ([], []):
        """
        creates children containers or directly attached content
        and returns the list of containers and contents of level+1
        :return:
        [Content or Container]
        """
        attached_paragraphs = []
        container_paragraphs = []
        container_title = None
        children = []
        in_children = False
        level = INFINITE
        child_id = 0

        while paragraphs:
            p = paragraphs.pop(0)
            if not in_children and not p.is_structure:
                attached_paragraphs.append(p)
            else:
                in_children = True
                if p.blank:
                    continue
                if p.is_structure and p.level <= level:  # if p is higher or equal in hierarchy
                    if container_paragraphs or container_title:
                        children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
                        child_id += 1
                    container_paragraphs = []
                    container_title = p
                    level = p.level
                else:  # p is strictly lower in hierarchy
                    container_paragraphs.append(p)

        if container_paragraphs or container_title:
            children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
            child_id += 1

        return attached_paragraphs, children

    @property
    def structure(self):

        self_structure = {str(self.id_): {
            'index': str(self.id_),
            'canMove': True,
            'isFolder': True,
            'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
            'canRename': True,
            'data': {},
            'level': self.level,
            'rank': self.rank,
            'title': self.title.text if self.title else 'root'
        }}
        paragraphs_structure = [p.structure for p in self.paragraphs]
        structure = [self_structure] + paragraphs_structure
        for child in self.children:
            structure += child.structure
        return structure

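Container builds a section tree from a flat paragraph stream: structural paragraphs open a new child at their level, everything else attaches to the innermost open section. A small sketch (not part of the commit; note that father must expose an id_ attribute, as Doc does, so a stand-in is used here):

from types import SimpleNamespace
from src.model.paragraph import Paragraph
from src.model.container import Container

paras = [
    Paragraph("Intro", "title1", id_=0, page_id=1),
    Paragraph("Some body text.", "content", id_=1, page_id=1),
    Paragraph("Details", "title2", id_=2, page_id=1),
    Paragraph("More body text.", "content", id_=3, page_id=1),
]
root = Container(paras, father=SimpleNamespace(id_=0))
print(len(root.children))                       # 1: the "Intro" section
print(root.children[0].children[0].title.text)  # "Details", nested under "Intro"
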
src/model/doc.py
ADDED
@@ -0,0 +1,79 @@
from src.model.container import Container
from src.tools.index_creation import set_indexes
from src.tools.reader_word import WordReader
from src.tools.readers_pdf import Reader, Reader_illumio
from src.tools.reader_html import Reader_HTML
from src.model.paragraph import Paragraph


class Doc:

    def __init__(self, path='', include_images=True, actual_first_page=1):

        self.title = self.get_title(path)
        self.extension = self.title.split('.')[-1]
        self.id_ = id(self)
        self.path = path
        paragraphs = []
        if self.extension == 'docx':
            paragraphs = WordReader(path).paragraphs
        elif self.extension == 'pdf':
            if "Illumio_Core_REST_API_Developer_Guide_23.3" in self.title:
                paragraphs = Reader_illumio(path).paragraphs
            else:
                paragraphs = Reader(path, actual_first_page, include_images).paragraphs
        else:
            paragraphs = Reader_HTML(path).paragraphs
        self.container = Container(paragraphs, father=self, title=self.set_first_container_title(self.title.split(".")[0], self.extension))
        set_indexes(self.container)
        self.blocks = self.get_blocks()

    def get_title(self, path) -> str:
        if '/' not in path and '\\' not in path:
            res = path
        if '/' in path:
            res = path.split('/')[-1]
        if '\\' in path:
            res = path.split('\\')[-1]
        return res

    @property
    def structure(self):
        return self.container.structure

    def get_blocks(self):

        def from_list_to_str(index_list):
            index_str = str(index_list[0])
            for el in index_list[1:]:
                index_str += '.' + str(el)
            return index_str

        blocks = self.container.blocks
        for block in blocks:
            block.doc = self.title
            block.index = from_list_to_str(block.index)
        return blocks

    def set_first_container_title(self, title, extension) -> Paragraph:
        if extension == 'pdf':
            return Paragraph(text=title, font_style='title0', id_=0, page_id=0)
        elif extension == 'docx':
            return Paragraph(text=title, font_style='title0', id_=0, page_id=1)
        else:
            return Paragraph(text=title, font_style='h0', id_=0, page_id=1)

    """
    current_level = len(current_index)
    if 0 < block.level:
        if block.level == current_level:
            current_index[-1] += 1
        elif current_level < block.level:
            current_index.append(1)
        elif block.level < current_level:
            current_index = current_index[:block.level]
            current_index[-1] += 1
        block.index = from_list_to_str(current_index)
    else:
        block.index = "0"
    """

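get_title is effectively a static path helper, and control.py calls it without an instance by passing the class itself as self (Doc.get_title(Doc, path)). A sketch of that calling convention (not part of the commit; importing src.model.doc assumes the reader dependencies from requirements.txt are installed):

from src.model.doc import Doc

print(Doc.get_title(Doc, "data/Test/list.html"))  # "list.html"
print(Doc.get_title(Doc, "C:\\docs\\guide.pdf"))  # "guide.pdf" (backslash paths work too)
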
src/model/paragraph.py
ADDED
@@ -0,0 +1,39 @@
import string

INFINITE = 10000

class Paragraph:
    def __init__(self, text: str, font_style: str, id_: int, page_id: int):
        self.font_style = font_style
        self.id_ = int(str(2) + str(page_id) + str(id_))
        self.page_id = page_id
        self.level = self.handle_levels(font_style)
        self.is_structure = self.level < INFINITE
        self.text = text

    @property
    def blank(self):
        """
        checks if the paragraph is blank: i.e. it brings some signal (it may otherwise be ignored)
        """
        text = self.text.replace('\n', '')
        return set(text).isdisjoint(string.ascii_letters)

    def rearrange_paragraph(self):
        """
        rearrange the paragraph to have a better structure
        """
        if self.font_style == "code":
            self.text = "\n\nCode :```\n" + self.text + "\n```\n\n"
        elif self.font_style == "table":
            self.text = "\n\nTable :\n" + self.text + "\n\n"
        return self

    def handle_levels(self, font_style: str):
        if len(font_style) != 5 and 'title' in font_style:
            return int(font_style[-1])
        elif len(font_style) == 2 and font_style[0] == 'h':
            return int(font_style[-1])
        else:
            return INFINITE

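Two details of Paragraph worth illustrating: blank treats any text without ASCII letters as noise, and handle_levels maps both "titleN" and "hN" font styles to a numeric level, with everything else pushed to INFINITE (non-structural). A sketch (not part of the commit):

from src.model.paragraph import Paragraph

print(Paragraph("  \n\t", "content", id_=0, page_id=1).blank)  # True: no letters, no signal
print(Paragraph("API", "title2", id_=1, page_id=1).level)      # 2
print(Paragraph("API", "h2", id_=2, page_id=1).level)          # 2
print(Paragraph("body", "content", id_=3, page_id=1).level)    # 10000 (INFINITE)
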
src/tools/__init__.py
ADDED
File without changes

src/tools/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (178 Bytes).

src/tools/__pycache__/index_creation.cpython-311.pyc
ADDED
Binary file (4.83 kB).

src/tools/__pycache__/llm.cpython-311.pyc
ADDED
Binary file (11.7 kB).

src/tools/__pycache__/reader_html.cpython-311.pyc
ADDED
Binary file (8.28 kB).

src/tools/__pycache__/reader_pdf_tools.cpython-311.pyc
ADDED
Binary file (3.64 kB).

src/tools/__pycache__/reader_word.cpython-311.pyc
ADDED
Binary file (4.72 kB).

src/tools/__pycache__/readers_pdf.cpython-311.pyc
ADDED
Binary file (25.1 kB).

src/tools/__pycache__/retriever.cpython-311.pyc
ADDED
Binary file (3.24 kB).

src/tools/__pycache__/table_converter.cpython-311.pyc
ADDED
Binary file (1.03 kB).

src/tools/index_creation.py
ADDED
@@ -0,0 +1,67 @@
from src.model.container import Container

INFINITE = 99999

def create_dic_levels(c: Container, dict_of_levels: dict = None):
    # Use None instead of a mutable default argument: a shared default dict
    # would leak level state between successive documents.
    if dict_of_levels is None:
        dict_of_levels = {}
    if c.level == 0:
        dict_of_levels[c.level] = [0]
    for child in c.children:
        if child.level not in dict_of_levels:
            dict_of_levels[child.level] = [1 for _ in range(child.level)]
        create_dic_levels(child, dict_of_levels)
    if INFINITE in dict_of_levels.keys():
        dict_of_levels[INFINITE] = [1]
    return dict_of_levels


def create_good_indexes(c: Container, dict_of_levels: dict):
    actual_level = c.level
    c.index = dict_of_levels[actual_level].copy()
    actual_len = len(dict_of_levels[actual_level])
    temp_update = dict_of_levels[actual_level][-1]
    dict_of_levels[actual_level][-1] += 1
    for i in dict_of_levels.values():
        if len(i) > actual_len:
            i[actual_len - 1] = temp_update
    for child in c.children:
        c_lvl = child.level
        for i in dict_of_levels.values():
            if len(i) > c_lvl:
                i[c_lvl:] = [1 for _ in range(len(i[c_lvl:]))]
        create_good_indexes(child, dict_of_levels)  # Apply the function recursively to all children


def create_good_indexes_not_ordered_titles(c: Container, dict_of_levels: dict):
    actual_level = c.level
    c.index = dict_of_levels[actual_level].copy()
    actual_len = len(dict_of_levels[actual_level])
    temp_update = dict_of_levels[actual_level][-1]
    dict_of_levels[actual_level][-1] += 1
    for i in dict_of_levels.values():
        if len(i) > actual_len:
            i[actual_len - 1] = temp_update
    for child in c.children:
        c_lvl = child.level
        for i in dict_of_levels.values():
            if len(i) > c_lvl:
                i[c_lvl:] = [1 for _ in range(len(i[c_lvl:]))]
        create_good_indexes(child, dict_of_levels)  # Apply the function recursively to all children


def set_good_block_indexes(c: Container):
    for i in c.containers:
        for b in i.blocks:
            b.index = i.index


def set_indexes(c: Container):
    dict_levels = create_dic_levels(c)
    my_keys = sorted(dict_levels.keys())
    dict_levels = {key: dict_levels[key] for key in my_keys}
    if c.children and c.children[0] and (c.children[0].level > min(list(dict_levels.keys())[1:])):
        c.children[0].level = min(list(dict_levels.keys())[1:])
        create_good_indexes_not_ordered_titles(c, dict_levels)
    else:
        create_good_indexes(c, dict_levels)
    set_good_block_indexes(c)

src/tools/llm.py
ADDED
@@ -0,0 +1,149 @@
import openai

class LlmAgent:

    def __init__(self, llm_model: str):
        self.llm = llm_model

    def generate_paragraph(self, query: str, context: {}, histo: [(str, str)], language='fr') -> str:
        """generates the answer"""
        template = (f"You are a conversation bot designed to answer to the query from users."
                    f"Your answer is based on the context delimited by triple backticks :\n ``` {context} ```\n"
                    f"You are consistent and avoid redundancies with the rest of the initial conversation delimited by triple backticks :\n ``` {histo} ```\n"
                    f"Your response shall be in {language} and shall be concise."
                    f"You shall only provide the answer, nothing else before and after."
                    f"Here is the query you are given :\n"
                    f"``` {query} ```")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": template}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----")
        return str(res)

    def generate_paragraph_v2(self, query: str, context: {}, histo: [(str, str)], language='fr') -> str:
        """generates the answer"""
        context_for_the_ai = (f"You are a conversation bot designed to answer to the query from users."
                              f"Your answer is based on the context delimited by triple backticks :\n ``` {context} ```\n"
                              f"You are consistent and avoid redundancies with the rest of the initial conversation delimited by triple backticks :\n ``` {histo} ```\n"
                              f"Your response shall be in {language} and shall be concise.")
        generation = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k", messages=[{"role": "system", "content": context_for_the_ai}, {"role": "user", "content": query}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----")
        return str(res)

    def translate(self, text: str) -> str:
        """translates"""
        template = (f"Your task consists in translating in English the following text delimited by triple backticks: ``` {text} ```\n"
                    f"If the text is already in English, just return it !\n"
                    f"You must not provide an answer to the text, just translate it.\n")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": template}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----TRANSLATE----")
        return res

    def translate_v2(self, text: str) -> str:
        """translates"""
        task = "Translate in English the text. If it is already in English, just return the text."
        generation = openai.ChatCompletion.create(model="gpt-4", messages=[{"role": "system", "content": task}, {"role": "user", "content": text}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----TRANSLATE V2----")
        return res

    def generate_answer(self, query: str, answer: str, histo: str, context: str, language: str) -> str:
        """provides the final answer in {language} based on the initial query and the answer in english"""
        template = (f"Your task consists in translating the answer in {language}, if its not already the case, to the query "
                    f"delimited by triple backticks: ```{query}``` \n"
                    f"You don't add new content to the answer but: "
                    f"1 You can use some vocabulary from the context delimited by triple backticks:\n"
                    f"```{context}```\n"
                    f"2 You are consistent and avoid redundancies with the rest of the initial "
                    f"conversation delimited by triple backticks: ```{histo}```\n"
                    f"Your response shall respect the following format:<response>\n"
                    f"Here is the answer you are given in {language}:"
                    f"{answer}")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": template}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----")
        return str(res).strip()

    def summarize_paragraph(self, prompt: str, title_doc: str = '', title_para: str = ''):
        """summarizes the paragraph"""
        max_tokens = 700
        template = (f"Your task consists in summarizing the paragraph of the document untitled ```{title_doc}```."
                    f"The paragraph title is ```{title_para}```."
                    f"Your response shall be concise and shall respect the following format:"
                    f"<summary>"
                    f"If you see that the summary that you are creating will not respect ```{max_tokens}``` tokens, find a way to make it shorter."
                    f"The paragraph you need to summarize is the following :"
                    f"{prompt}")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": template}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----")
        return str(res).strip()

    def summarize_paragraph_v2(self, prompt: str, title_doc: str = '', title_para: str = ''):
        """summarizes the paragraph"""
        max_tokens = 850
        location_of_the_paragraph = prompt.split(" :")[0]
        task = (f"Your task consists in summarizing in English the paragraph of the document untitled ```{title_doc}``` located in the ```{location_of_the_paragraph}``` section of the document."
                f"The paragraph title is ```{title_para}```."
                f"Your response shall be concise and shall respect the following format:"
                f"<summary>"
                f"If you see that the summary that you are creating will not respect ```{max_tokens}``` tokens, find a way to make it shorter.")
        generation = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k", messages=[{"role": "system", "content": task}, {"role": "user", "content": prompt}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----")
        return str(res).strip()

    def transform_paragraph_into_question(self, prompt: str, title_doc: str = '', title_para: str = '') -> (str, str):
        max_tokens = 150

        prompt_template = (f"Your job is to create two questions about a paragraph of a document untitled ```{title_doc}```."
                           f"The paragraph title is ```{title_para}```."
                           f"If you see that the questions that you are creating will not respect ```{max_tokens}``` tokens, find a way to make them shorter."
                           f"If you can't create a question about the paragraph, just rephrase ```{title_para}``` so that it becomes a question."
                           f"Your response shall contain two questions, shall be concise and shall respect the following format:"
                           f"`<question1>!=;<question2>`"
                           f"You should not answer to the questions, just create them. Moreover, you shall include the title of the paragraph in the questions."
                           f"The paragraph you need to create questions about is the following :"
                           f"{prompt}")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": prompt_template}])
        res = generation.choices[0].message.content
        print("****************")
        res = str(res).split("!=;")
        if len(res) == 1:
            return (res[0], "")
        elif len(res) == 2:
            return (res[0], res[1])
        else:
            return ("", "")

    def detect_language(self, text: str) -> str:
        """detects the language"""
        template = (f"Your task consists in detecting the language of the last question or sentence of the text."
                    f"You should only give the two letters code of the language detected, nothing else."
                    f"Here is the text you are given delimited by triple backticks : ```{text}```")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": template}])
        res = generation.choices[0].message.content
        return str(res).strip()

    def detect_language_v2(self, text: str) -> str:
        """detects the language"""
        task = (f"Your task consists in detecting the language of the last question or sentence of the text."
                f"You should only give the two letters code of the language detected, nothing else.")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "system", "content": task}, {"role": "user", "content": text}])
        res = generation.choices[0].message.content
        return str(res).strip()

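LlmAgent wraps the pre-1.0 openai package (openai.ChatCompletion), so the key is taken from module state rather than passed per call. A hypothetical usage sketch (not part of the commit; requires OPENAI_API_KEY in the environment and the openai<1.0 dependency pinned by requirements.txt):

import os
import openai
from src.tools.llm import LlmAgent

openai.api_key = os.environ["OPENAI_API_KEY"]
agent = LlmAgent(llm_model="gpt-4")
print(agent.detect_language_v2("Bonjour, comment ça va ?"))  # expected: "fr"
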
src/tools/pretty_print.py
ADDED
@@ -0,0 +1,33 @@
from src.model.paragraph import Paragraph
from src.model.container import Container


# function that pretty prints the paragraphs
def pretty_printer_paragraphs(paragraphs):
    for p in paragraphs:
        if p.font_style == "title1":
            print(f"Titre 1 {p.text}")
        elif p.font_style == "title2":
            print(f"---> Titre 2 {p.text}")
        elif p.font_style == "title3":
            print(f"-------> Titre 3 {p.text}")
        elif p.font_style == "title4":
            print(f"-----------> Titre 4 {p.text}")
        elif p.font_style == "content":
            print(f"---------------> {p.text}")
        elif p.font_style == "code":
            print(f"----------code------------> {p.text}")
        elif p.font_style == "table":
            print(f"----------table------------> {p.text}")

def pretty_print_container_structure(container):
    if container.title:
        print(f"{'-'*container.level} {container.title.text}")
    for p in container.paragraphs:
        print(f"{'-'*container.level} {p.text}")
    for c in container.children:
        pretty_print_container_structure(c)

def print_all_block_indexes(container):
    for b in container.blocks:
        print(f'{b.index} : {b.title if b.title else ""}')

src/tools/reader_html.py
ADDED
@@ -0,0 +1,118 @@
from pyquery import PyQuery as pq
from src.model.paragraph import Paragraph
from bs4 import BeautifulSoup
from src.tools.readers_pdf import Reader_illumio
from src.tools.table_converter import table_converter

class Reader_HTML:
    def __init__(self, path):
        self.path = path
        self.paragraphs = self.read_html_2(path)

    # pyquery-based variant; kept for reference, but it does not work well
    def read_html(self, path):
        with open(path, 'r') as html_file:
            doc = pq(html_file.read())

        # Remove script and style elements
        doc('script').remove()
        doc('style').remove()

        paragraphs = []
        for index, elem in enumerate(doc('*')):
            # Check if the element is a leaf (does not contain other elements)
            if not pq(elem).find('*'):
                text = pq(elem).text().strip()
                if text:
                    paragraphs.append(Paragraph(text=text, font_style=elem.tag, id_=index, page_id=1))
        return paragraphs

    # BeautifulSoup-based variant, used by __init__
    def read_html_2(self, path):
        # Reading the file
        with open(path, "r") as html_file:
            reader = html_file.read()
        paragraphs = []
        # Creating a BeautifulSoup object and specifying the parser
        S = BeautifulSoup(reader, 'html.parser')
        for tag in S(['style', 'script', 'footer', 'header', 'nav', 'aside', 'form']):
            tag.decompose()

        # Get all elements that do not contain other elements
        leaf_elements = [elem for elem in S.body.descendants if elem.name is not None and not elem.find_all()]
        paragraphs = []
        for index, elem in enumerate(leaf_elements):
            text = elem.get_text(strip=True, separator='\n')
            if text:
                p = Paragraph(text=text, font_style=elem.name, id_=index, page_id=1)
                paragraphs.append(p)
        paragraphs = self.concatenate_paragraphs_with_same_font_style(paragraphs)
        paragraphs = [p.rearrange_paragraph() for p in paragraphs]
        return paragraphs

    def concatenate_paragraphs_with_same_font_style(self, paragraphs: [Paragraph]):
        i = 0
        while i < len(paragraphs) - 1:
            if paragraphs[i].font_style == "th":
                paragraphs = self.create_table(paragraphs, i)
                i += 1
            elif paragraphs[i].font_style == "li":
                paragraphs, i = self.create_list(paragraphs, i)
                i += 1
            elif paragraphs[i].font_style == paragraphs[i+1].font_style:
                paragraphs[i].text += "\n" + paragraphs[i+1].text
                paragraphs.pop(i+1)
            else:
                i += 1
        return paragraphs

    def create_table(self, paragraphs, i: int):
        table = []
        titles = []
        content = []
        while i < len(paragraphs) and paragraphs[i].font_style == "th":
            titles.append(paragraphs[i].text)
            paragraphs.pop(i)
        table.append(titles)
        length = len(titles)
        temp = 0
        while i < len(paragraphs) and paragraphs[i].font_style == "td":
            if temp == length:
                # a full row has been collected: flush it before starting the next one
                temp = 0
                table.append(content)
                content = [paragraphs[i].text]
            else:
                content.append(paragraphs[i].text)
            paragraphs.pop(i)
            temp += 1
        table.append(content)
        paragraphs.insert(i, Paragraph(table_converter(table), font_style="table", id_=i, page_id=1))
        return paragraphs

    def create_list(self, paragraphs, i: int):
        list_content = []
        while i < len(paragraphs) and paragraphs[i].font_style in ["ul", "ol", "li"]:
            if paragraphs[i].font_style == "li":
                list_content.append(paragraphs[i].text)
                paragraphs.pop(i)
            elif paragraphs[i].font_style in ["ul", "ol"]:
                sublist, i = self.create_list(paragraphs, i+1)
                list_content.append(sublist)
            else:
                i += 1
        list_paragraph = Paragraph(text=self.format_list(list_content), font_style="list", id_=i, page_id=1)
        paragraphs.insert(i, list_paragraph)
        return paragraphs, i

    def format_list(self, list_content):
        res = ""
        for i in range(len(list_content)):
            if type(list_content[i]) == str:
                res += f"{i+1}. {list_content[i]}\n"
            else:
                res += f"{i+1}. {self.format_list(list_content[i])}\n"
        return res

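Reader_HTML keeps only leaf elements, merges consecutive leaves with the same tag, and folds th/td runs into tables and li runs into lists. A small sketch (not part of the commit; the temporary file name is arbitrary, and the PDF-reader dependencies must be installed since readers_pdf is imported transitively):

from src.tools.reader_html import Reader_HTML

html = "<html><body><h1>Guide</h1><p>First line.</p><p>Second line.</p></body></html>"
with open("/tmp/demo.html", "w") as f:
    f.write(html)

for p in Reader_HTML("/tmp/demo.html").paragraphs:
    print(p.font_style, repr(p.text))
# h1 'Guide'                       -- kept as a structural title
# p  'First line.\nSecond line.'   -- the two <p> leaves are merged
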
src/tools/reader_pdf_tools.py
ADDED
@@ -0,0 +1,56 @@
def flatten(S):
    if S == []:
        return S
    if isinstance(S[0], list):
        return flatten(S[0]) + flatten(S[1:])
    return S[:1] + flatten(S[1:])

def keep_int_and_floats_in_list(S):
    i = 0
    while i < len(S):
        if isinstance(S[i], str):
            S.pop(i)
        else:
            i += 1
    return S

def group_formats(formats: list) -> list:
    # create a list of lists of formats that are close to each other (at most 0.2 apart)
    formats = sorted(formats)
    groups = []
    current_group = []
    current_format = formats[0]
    for format in formats:
        if format - current_format <= 0.20:
            current_group.append(format)
        else:
            groups.append(current_group)
            current_group = [format]
            current_format = format
    groups.append(current_group)
    return groups

def find_max_list(list):
    list_len = [len(i) for i in list]
    return len(list) - 1 - list_len[::-1].index(max(list_len))

def find_good_key_in_dict(dict: dict, value) -> str:
    for key in dict.keys():
        if value in dict[key]:
            return key
    return None

def create_dict_and_assign_styles_from_format(formats: list) -> dict:
    # create a dictionary with the format as key and the style as value
    styles = {}
    content_format_index = find_max_list(formats)
    i = 0
    for l in formats[:content_format_index]:
        formats[content_format_index - i] += l
        del formats[formats.index(l)]
        i += 1
    number_of_styles = len(formats)
    styles["content"] = sorted(list(set(formats[0])))
    for i in range(1, len(formats)):
        styles["title" + str(number_of_styles - i)] = sorted(list(set(formats[i])))
    return styles

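flatten and group_formats are the core of the font-size clustering used by the PDF readers: sizes within 0.2 of the first member of a group are treated as one style. A sketch (not part of the commit):

from src.tools.reader_pdf_tools import flatten, group_formats

print(flatten([1, [2, [3]], 4]))  # [1, 2, 3, 4]
print(group_formats([9.0, 9.1, 9.2, 12.0, 12.1, 18.0]))
# [[9.0, 9.1, 9.2], [12.0, 12.1], [18.0]]
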
src/tools/reader_word.py
ADDED
@@ -0,0 +1,106 @@
import docx
import os
# sys.path.append('path to app')
# import docx
# import os
# import sys

from src.model.paragraph import Paragraph

class WordReader:

    def __init__(self, path):
        self.path = path
        self.paragraphs = self.get_word_paragraphs()

    def get_word_paragraphs(self):
        """
        Fetches paragraphs from a Word document.

        Returns:
            list: List of Paragraph objects from the document.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(f"The file {self.path} does not exist.")

        try:
            doc = docx.Document(self.path)
            paragraphs = self.to_paragraph_objects(doc.paragraphs)  # Convert to Paragraph objects
            return paragraphs
        except Exception as e:
            raise ValueError(f"Error reading the .docx file. Original error: {str(e)}")

    def determine_style(self, paragraph):
        """
        Determines the style of the paragraph based on its attributes.

        Returns:
            str: Style of the paragraph.
        """
        # Check for heading styles first
        if paragraph.style.name.startswith('Heading 1'):
            return "title1"
        elif paragraph.style.name.startswith('Heading 2'):
            return "title2"
        elif paragraph.style.name.startswith('Heading 3'):
            return "title3"
        elif paragraph.style.name.startswith('Heading 4'):
            return "title4"
        elif paragraph.style.name.startswith('Heading 5'):
            return "title5"

        # If not a heading, check the runs within the paragraph
        for run in paragraph.runs:
            font = run.font
            fontname = font.name
            size = font.size

            # Convert the stored length to points
            if size:
                size_in_points = size.pt

                # Map based on font name and size as in the PDF reader
                if fontname == "XFQKGD+Consolas":
                    return "code"
                elif (size_in_points >= 9 and size_in_points < 11.5) or fontname == "Wingdings-Regular":
                    return "content"
        # If none of the above conditions match, default to 'content'
        return "content"

    def to_paragraph_objects(self, doc_paragraphs):
        """
        Convert docx paragraphs to Paragraph objects for further processing.
        """
        paragraph_objects = []
        for idx, paragraph in enumerate(doc_paragraphs):
            style = self.determine_style(paragraph)

            # Assuming page_id is always 1 for simplicity, change as needed.
            p_obj = Paragraph(text=paragraph.text, font_style=style, id_=idx, page_id=1)
            paragraph_objects.append(p_obj)
        paragraphs = self.rearrange_paragraphs(paragraph_objects)

        return paragraphs

    def rearrange_paragraphs(self, paragraphs: [Paragraph]):
        # associate paragraphs with the same font style
        i = 0
        while i < len(paragraphs):
            paragraphs[i] = paragraphs[i].rearrange_paragraph()
            i += 1
        return paragraphs

    def display_paragraphs(self):
        """
        Prints the paragraphs from the document to the console.
        """
        for paragraph in self.paragraphs:
            print(paragraph.text)
            print('-' * 40)  # separator for clarity

# if __name__ == '__main__':
#     reader = WordReader("Illumio_Core_REST_API_Developer_Guide_23.3.docx")
#     reader.display_paragraphs()

src/tools/readers_pdf.py
ADDED
@@ -0,0 +1,428 @@
import json
import PyPDF2
# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
# To extract the images from the PDFs
from PIL import Image
from pdf2image import convert_from_path
# To perform OCR and extract text from images
import pytesseract
# To remove the additional files created along the way
import os
import pdfplumber as pdfp

from src.model.paragraph import Paragraph
from src.tools.table_converter import table_converter
from src.tools.reader_pdf_tools import *
import gradio as gr


def get_style_of_line(size: float, fontname: str):
    if fontname == "XFQKGD+Consolas":
        return "code"
    elif (size >= 9 and size < 11.5) or fontname == "CRRYJU+Wingdings-Regular":
        return "content"
    elif size >= 11.5 and size <= 12.7:
        return "title5"
    elif size >= 12.8 and size <= 13.5:
        return "title4"
    elif size > 13.5 and size <= 15.5:
        return "title3"
    elif size > 15.5 and size <= 18.5:
        return "title2"
    elif size > 19 and size < 30:
        return "title1"
    else:
        return "unknown"


class Reader:
    def __init__(self, path, actual_first_page_=0, include_images=True):
        self.path = path
        self.paragraphs = self.pdf_manager(path, actual_first_page_, include_images=include_images)

    def most_occuring_fonts(self, line_formats: list):
        if line_formats != []:
            min_freq = 3
            font_size_freq = {i: line_formats.count(i) for i in set(line_formats) if isinstance(i, float)}
            most_occuring_font_sizes = [size for size, freq in font_size_freq.items() if freq >= min_freq]
            line_formats = [i for i in line_formats if i in most_occuring_font_sizes or isinstance(i, str)]
        return line_formats

    def text_extraction(self, element):
        # Extract the text from the in-line text element
        line_text = element.get_text()
        # Find the formats of the text:
        # initialize the list with all the formats that appear in the line of text
        line_formats = []
        for text_line in element:
            if isinstance(text_line, LTTextContainer):
                # Iterate through each character in the line of text
                for character in text_line:
                    if isinstance(character, LTChar):
                        # Append the font name of the character
                        line_formats.append(character.fontname)
                        # Append the font size of the character
                        line_formats.append(character.size)
        # Find the most occurring font sizes and keep them; if there are several, keep all of them
        line_formats = self.most_occuring_fonts(line_formats)
        # Find the unique font sizes and names in the line and delete the None values
        format_per_line = list(set(line_formats))
        # Return a tuple with the text of the line along with its formats
        return (line_text, format_per_line)

    # Extract a table from the page
    def extract_table(self, pdf_path, page_num, table_num):
        # Open the pdf file
        pdf = pdfplumber.open(pdf_path)
        # Find the examined page
        table_page = pdf.pages[page_num]
        # Extract the appropriate table
        table = table_page.extract_tables()[table_num]
        return table

    # Check whether the element lies inside any of the tables present on the page
    def is_element_inside_any_table(self, element, page, tables):
        x0, y0up, x1, y1up = element.bbox
        # Convert the coordinates because pdfminer counts from the bottom to the top of the page
        y0 = page.bbox[3] - y1up
        y1 = page.bbox[3] - y0up
        for table in tables:
            tx0, ty0, tx1, ty1 = table.bbox
            if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
                return True
        return False

    # Find the table a given element belongs to
    def find_table_for_element(self, element, page, tables):
        x0, y0up, x1, y1up = element.bbox
        # Convert the coordinates because pdfminer counts from the bottom to the top of the page
        y0 = page.bbox[3] - y1up
        y1 = page.bbox[3] - y0up
        for i, table in enumerate(tables):
            tx0, ty0, tx1, ty1 = table.bbox
            if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
                return i  # Return the index of the table
        return None

    # Crop image elements from the PDF
    def crop_image(self, element, pageObj):
        # Get the coordinates to crop the image from the PDF
        [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]
        # Crop the page using coordinates (left, bottom, right, top)
        pageObj.mediabox.lower_left = (image_left, image_bottom)
        pageObj.mediabox.upper_right = (image_right, image_top)
        # Save the cropped page to a new PDF
        cropped_pdf_writer = PyPDF2.PdfWriter()
        cropped_pdf_writer.add_page(pageObj)
        # Save the cropped PDF to a new file
        with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
            cropped_pdf_writer.write(cropped_pdf_file)

    # Convert the (cropped) PDF to an image
    def convert_to_images(self, input_file):
        images = convert_from_path(input_file)
        image = images[0]
        output_file = 'PDF_image.png'
        image.save(output_file, 'PNG')

    # Read text from an image with OCR
    def image_to_text(self, image_path):
        # Read the image
        img = Image.open(image_path)
        # Extract the text from the image
        text = pytesseract.image_to_string(img)
        return text

    def pdf_manager(self, pdf_path, actual_first_page=0, include_images=True):
        # Create a PDF file object
        pdfFileObj = open(pdf_path, 'rb')
        # Create a PDF reader object
        pdfReaded = PyPDF2.PdfReader(pdfFileObj)
        number_of_pages = len(pdfReaded.pages)
        # Create the dictionary that will hold the text extracted from each page
        text_per_page = {}
        # Create a boolean variable for image detection
        image_flag = False
        actual_first_page = int(actual_first_page)
        if actual_first_page > number_of_pages:
            gr.Warning("The number of pages you want to skip is greater than the number of pages in the document. We will extract all the pages.")
            page_numbers = None
        else:
            page_numbers = [i for i in range(actual_first_page - 1, number_of_pages)]
        # Extract the pages from the PDF
        for pagenum, page in enumerate(extract_pages(pdf_path, page_numbers=page_numbers)):
            # Initialize the page object
            pagenum = page_numbers[pagenum] if page_numbers else pagenum
            pageObj = pdfReaded.pages[pagenum]
            # Initialize the variables needed for the text extraction from the page
            page_text = []
            line_format = []
            text_from_images = []
            text_from_tables = []
            page_content = []
            # Initialize the number of the examined tables
            table_in_page = -1
            # Open the pdf file
            pdf = pdfplumber.open(pdf_path)
            # Find the examined page
            page_tables = pdf.pages[pagenum]
            # Find the tables on the page
            tables = page_tables.find_tables()
            if len(tables) != 0:
                table_in_page = 0

            # Extract the tables of the page
            for table_num in range(len(tables)):
                # Extract the information of the table
                table = self.extract_table(pdf_path, pagenum, table_num)
                # Convert the table information into a structured string format
                table_string = table_converter(table)
                # Append the table string to a list
                text_from_tables.append(table_string)

            # Find all the elements
            page_elements = [(element.y1, element) for element in page._objs]
            # Sort all the elements as they appear in the page
            page_elements.sort(key=lambda a: a[0], reverse=True)

            # Walk through the elements that compose the page
            for i, component in enumerate(page_elements):
                # Extract the element of the page layout
                element = component[1]

                # Check the elements for tables
                if table_in_page == -1:
                    pass
                else:
                    if self.is_element_inside_any_table(element, page, tables):
                        table_found = self.find_table_for_element(element, page, tables)
                        if table_found == table_in_page and table_found != None:
                            page_content.append(text_from_tables[table_in_page])
                            page_text.append('table')
                            line_format.append('table')
                            table_in_page += 1
                        # Skip this iteration because the content of this element was already extracted from the tables
                        continue

                if not self.is_element_inside_any_table(element, page, tables):
                    # Check if the element is a text element
                    if isinstance(element, LTTextContainer):
                        # Extract the text and format for each text element
                        (line_text, format_per_line) = self.text_extraction(element)
                        # Append the text of each line to the page text
                        page_text.append(line_text)
                        # Append the format of each line containing text
                        line_format.append(format_per_line)
                        page_content.append(line_text)

                    # Check the elements for images
                    if include_images:
                        if isinstance(element, LTFigure):
                            # Crop the image from the PDF
                            self.crop_image(element, pageObj)
                            # Convert the cropped pdf to an image
                            self.convert_to_images('cropped_image.pdf')
                            # Extract the text from the image
                            image_text = self.image_to_text('PDF_image.png')
                            text_from_images.append(image_text)
                            page_content.append(image_text)
                            # Add a placeholder in the text and format lists
                            page_text.append('image')
                            line_format.append('image')
                            # Update the flag for image detection
                            image_flag = True

            # Create the key of the dictionary
            dctkey = 'Page_' + str(pagenum)
            # Add the list of lists as the value of the page key
            text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]

        # Close the pdf file object
        pdfFileObj.close()

        # Create a list of formats for all the pages
        formats = []
        for p in text_per_page.values():
            formats.append(p[1])

        # Flatten the list of lists
        formats = flatten(formats)

        # Keep only the font sizes in the list
        formats = keep_int_and_floats_in_list(formats)

        # Group the formats into lists of similar formats
        grouped_formats = group_formats(formats)

        # Create a dictionary with the format as key and the style as value
        styles = create_dict_and_assign_styles_from_format(grouped_formats)

        # Dump the result to a separate file as JSON with some indentation for better visualization
        doc_name = pdf_path.split('/')[-1] if '/' in pdf_path else pdf_path.split('\\')[-1]
        with open(file="styles.txt", mode='a') as fp:
            if fp.tell() == 0:
                fp.write('Document title: ' + doc_name + '\n')
            else:
                fp.write('\nDocument title: ' + doc_name + '\n')
            json.dump(styles, fp, indent=4)

        # Delete the additional files created if an image was detected
        if image_flag:
            os.remove('cropped_image.pdf')
            os.remove('PDF_image.png')

        # Beginning of the paragraph extraction
        paragraphs = []
        for index, page in enumerate(text_per_page.values()):
            content_format = page[1]
            j = 0
            while j + 1 < len(content_format):
                actual_format = content_format[j]
                n_of_fontsizes = len(list(i for i in actual_format if isinstance(i, int) or isinstance(i, float)))
                if n_of_fontsizes > 1:
                    actual_format = max(keep_int_and_floats_in_list(actual_format))
                    actual_format = find_good_key_in_dict(styles, actual_format)
                elif n_of_fontsizes == 1:
                    actual_format = keep_int_and_floats_in_list(actual_format)[0]
                    actual_format = find_good_key_in_dict(styles, actual_format)
                elif n_of_fontsizes == 0 and actual_format == "table":
                    actual_format = "table"
                else:
                    actual_format = "content"
                # Try to find the right format if the current result seems wrong
                # (this heuristic changes depending on the document)
                if len(page[4][j]) > 150 and "title" in actual_format:
                    actual_format = "content"
                paragraph = Paragraph(text=page[4][j], font_style=actual_format, id_=j, page_id=index)
                paragraphs.append(paragraph)
                j += 1

        paragraphs = self.concatenate_paragraphs(paragraphs, doc_name)
        return paragraphs

    def concatenate_paragraphs(self, paragraphs, doc_title):
        concatenated_paragraphs = []
        i = 0
        actual_page_id = paragraphs[0].page_id
        while i < len(paragraphs):
            p = paragraphs[i]
            if p.blank or "REST API Developer Guide 23.3" in p.text or "x! illumio" in p.text:
                i += 1
                continue
            if (p.page_id != actual_page_id) and doc_title == "Illumio_Core_REST_API_Developer_Guide_23.3.pdf" and (not p.font_style == "table" and not "title" in p.font_style):
                i += 2
                actual_page_id = p.page_id
                continue
            if not concatenated_paragraphs:
                concatenated_paragraphs.append(p)
            elif p.font_style != concatenated_paragraphs[-1].font_style:
                if (p.font_style == "table" and concatenated_paragraphs[-1].font_style == "content") \
                        or (p.font_style == "content" and concatenated_paragraphs[-1].font_style == "table"):
                    concatenated_paragraphs[-1].text += '\n' + p.text
                else:
                    concatenated_paragraphs.append(p)
            else:
                if "title" in p.font_style:
                    concatenated_paragraphs[-1].text += ' : ' + p.text
                    concatenated_paragraphs[-1].text = concatenated_paragraphs[-1].text.replace('\n', '').replace('\r', '')
                else:
                    concatenated_paragraphs[-1].text += '\n' + p.text
            i += 1
        return concatenated_paragraphs


class Reader_illumio:
    def __init__(self, path):
        self.path = path
        self.paragraphs = self.get_pdf_paragraphs(path)

    def skip_header(self, dictionary):
        i = 0
        if "Illumio_Core_REST_API_Developer_Guide_23.3" in self.path and not (dictionary[i]["chars"][0]["size"] > 19 and dictionary[i]["chars"][0]["size"] < 30):
            i += 2
        return i

    def get_pdf_paragraphs(self, path):
        pdf_to_read = self.extract_all_lines_from_the_doc(path)
        paragraphs = []
        j = 0
        while j < len(pdf_to_read):
            dictionary = pdf_to_read[j]["content"]
            tables = pdf_to_read[j]["tables"]
            i = self.skip_header(dictionary)
            table_count = 0
            while i < len(dictionary):
                if dictionary[i]["text"].startswith("RESTAPIDeveloperGuide"):
                    i += 1
                    continue
                if self.check_if_already_in_table(dictionary[i]['chars'][0], tables) == False:
                    p = Paragraph(dictionary[i]["text"], font_style=get_style_of_line(dictionary[i]["chars"][0]["size"], dictionary[i]["chars"][0]["fontname"]), id_=i, page_id=pdf_to_read[j]["page_number"])
                    if i != len(dictionary) - 1:
                        # Merge consecutive lines that share size and font name;
                        # the i + 1 bound keeps the look-ahead from running past the page
                        while i + 1 < len(dictionary) and (dictionary[i+1]["chars"][0]["size"] == dictionary[i]["chars"][-1]["size"] and dictionary[i+1]["chars"][0]["fontname"] == dictionary[i]["chars"][-1]["fontname"]) and self.check_if_already_in_table(dictionary[i+1]['chars'][0], tables) == False:
                            p.text += " " + dictionary[i+1]["text"]
                            i += 1
                    else:
                        p.text = dictionary[i]["text"]
                    i += 1
                    paragraphs.append(p)
                else:
                    p = Paragraph(table_converter(tables[table_count].extract()), font_style="table", id_=i, page_id=pdf_to_read[j]["page_number"])
                    paragraphs.append(p)
                    i = self.skip_out_table(dictionary, i, tables[table_count])
                    table_count += 1
            j += 1
        paragraphs = self.rearrange_paragraphs(paragraphs)
        return paragraphs

    def rearrange_paragraphs(self, paragraphs: list[Paragraph]):
        # Associate paragraphs that share the same font style
        i = 0
        while i < len(paragraphs):
            paragraphs[i] = paragraphs[i].rearrange_paragraph()
            i += 1
        return paragraphs

    def extract_all_lines_from_the_doc(self, path):
        lines_of_doc = []
        with open(path, 'rb') as f:
            reader = pdfp.PDF(f)
            if "Illumio_Core_REST_API_Developer_Guide_23.3" in path:
                # Skip the table of contents (the first 8 pages)
                skip_table_of_contents = reader.pages[8:]
                j = 0
                while j < len(skip_table_of_contents):
                    lines_of_doc.append({"page_number": j + 9, "content": skip_table_of_contents[j].extract_text_lines(), "tables": skip_table_of_contents[j].find_tables()})
                    j += 1
            else:
                for page in reader.pages:
                    lines_of_doc.append({"page_number": page.page_number, "content": page.extract_text_lines(), "tables": page.find_tables()})
        return lines_of_doc

    def check_if_already_in_table(self, line, tables):
        for table in tables:
            if table.bbox[1] <= line["top"] <= table.bbox[3]:
                return True
        return False

    def skip_out_table(self, dictionary, index, table):
        i = index
        while i < len(dictionary):
            if self.check_if_already_in_table(dictionary[i]['chars'][0], tables=[table]) == True:
                i += 1
            else:
                break
        return i
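A minimal usage sketch for the reader above (the PDF file name is hypothetical; note that pdf_manager also appends the detected style map to styles.txt as a side effect):

# Minimal sketch, assuming a local PDF named "example.pdf" (hypothetical file)
from src.tools.readers_pdf import Reader

reader = Reader("example.pdf", actual_first_page_=1, include_images=False)
for p in reader.paragraphs[:5]:
    print(p.font_style, ':', p.text[:80])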
src/tools/retriever.py
ADDED
@@ -0,0 +1,49 @@
from src.model.block import Block
from src.model.doc import Doc
from src.tools.llm import LlmAgent
import gradio as gr


class Retriever:
    def __init__(self, doc: Doc = None, collection=None, llmagent: LlmAgent = None):
        if doc != None:
            blocks_good_format: list[Block] = doc.blocks
            self.collection = collection
            gr.Info("Please wait while the database is being created")
            for block in blocks_good_format:
                if len(block.content) > 4500:
                    # Blocks that are too long are split before being summarized and indexed
                    new_blocks = block.separate_1_block_in_n(max_size=4500)
                    for new_block in new_blocks:
                        summary = llmagent.summarize_paragraph_v2(prompt=new_block.content, title_doc=doc.title, title_para=block.title)
                        if "<summary>" in summary:
                            summary = summary.split("<summary>")[1]
                        self.collection.add(
                            documents=[summary],
                            ids=[new_block.index],
                            metadatas=[new_block.to_dict()]
                        )
                else:
                    summary = llmagent.summarize_paragraph_v2(prompt=block.content, title_doc=doc.title, title_para=block.title)
                    if "<summary>" in summary:
                        summary = summary.split("<summary>")[1]
                    self.collection.add(
                        documents=[summary],
                        ids=[block.index],
                        metadatas=[block.to_dict()]
                    )
            gr.Info(f"The collection {collection.name} has been added to the database")
        else:
            self.collection = collection

    def similarity_search(self, queries: str) -> list:
        res = self.collection.query(query_texts=queries, n_results=6)
        block_dict_sources = res['metadatas'][0]
        distances = res['distances'][0]
        blocks = []
        for bd, d in zip(block_dict_sources, distances):
            b = Block().from_dict(bd)
            b.distance = d
            blocks.append(b)
        return blocks
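A minimal query sketch for the retriever above, assuming an already-populated chromadb collection (the client setup and collection name are hypothetical; passing doc=None skips indexing and only wraps the collection):

import chromadb
from src.tools.retriever import Retriever

client = chromadb.Client()  # hypothetical in-memory client
collection = client.get_or_create_collection("demo")  # hypothetical name, assumed already populated
retriever = Retriever(doc=None, collection=collection)
blocks = retriever.similarity_search("How do I authenticate?")
for b in blocks:
    print(b.index, b.distance)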
src/tools/table_converter.py
ADDED
@@ -0,0 +1,14 @@
# Convert a table into an appropriate string format

def table_converter(table):
    table_string = ''
    # Iterate through each row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Remove the line breaks from the wrapped texts and replace None cells with 'None'
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Convert the row into a pipe-delimited string
        table_string += ('|' + '|'.join(cleaned_row) + '|' + '\n')
    # Remove the last line break
    table_string = table_string[:-1]
    return table_string
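For illustration, a tiny hypothetical table in the nested-list form pdfplumber returns, and the pipe-delimited string the converter produces (embedded line breaks become spaces, None cells become the literal string 'None'):

from src.tools.table_converter import table_converter

table = [["Header A", "Header B"], ["value\nwrapped", None]]  # hypothetical 2x2 table
print(table_converter(table))
# |Header A|Header B|
# |value wrapped|None|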
src/view/__pycache__/view.cpython-311.pyc
ADDED
Binary file (17.5 kB).
src/view/view.py
ADDED
@@ -0,0 +1,262 @@
import gradio as gr
from src.control.control import Chatbot
from chromadb.utils import embedding_functions
import os

def run(ctrl: Chatbot, config: dict):
    with gr.Blocks() as qna:
        with gr.Row():
            with gr.Column():
                pass

            with gr.Column(scale=10):
                gr.Markdown(config['title'])
                page_start_warning = gr.Markdown("<center>⚠️ If your document starts with a front cover and/or a table of contents, please enter the page number of the first page with real content. ⚠️</center>")
                actual_page_start = gr.Number(
                    label="Start page (default = 1)",
                    visible=True,
                    interactive=True,
                    container=True,
                    value=1,
                )

                include_images_btn = gr.Checkbox(
                    label="Analyse text from images. This option is definitely slower, particularly on big documents. (ONLY for .pdf)",
                    value=False,
                    visible=True,
                    container=True,
                )

                input_doc_comp = gr.File(
                    label="Upload a file",
                    scale=1,
                    min_width=100,
                )

                histo_text_comp = gr.Chatbot(
                    visible=False,
                    value=[],
                )
                input_text_comp = gr.Textbox(
                    label="",
                    lines=1,
                    visible=False,
                    max_lines=3,
                    interactive=True,
                    placeholder="Ask your question here",
                )

                clear_btn = gr.Button("Clear Chat", visible=False)

                input_example_comp = gr.Radio(
                    label="Examples",
                    choices=config['examples'].values(),
                    value="",
                    visible=False,
                )

                source_text_comp = []
                for i in range(4):
                    source_text_comp.append(gr.Textbox(
                        lines=4,
                        max_lines=4,
                        interactive=False,
                        visible=False,
                    ))
                upload_another_doc_btn = gr.Button("Upload another document", visible=False)

                open_ai_embedding = embedding_functions.OpenAIEmbeddingFunction(api_key=os.environ['OPENAI_API_KEY'], model_name="text-embedding-ada-002")

            with gr.Column(scale=7):
                collections_list = gr.Radio(choices=[a.name for a in ctrl.client_db.list_collections()],
                                            label="Current collections in the database",
                                            visible=True,
                                            info="Choose a collection to query."
                                            )
                delete_database_btn = gr.Button("Delete current collection", visible=False)

        def input_doc_fn(input_doc_, include_images_, actual_page_start_):
            result = ctrl.upload_doc(input_doc_, include_images_, actual_page_start_)
            if result == True:
                return {
                    input_doc_comp: gr.update(visible=False),
                    input_text_comp: gr.update(visible=True),
                    input_example_comp: gr.update(visible=True),
                    clear_btn: gr.update(visible=True),
                    include_images_btn: gr.update(visible=False, value=include_images_),
                    delete_database_btn: gr.update(visible=True),
                    upload_another_doc_btn: gr.update(visible=True),
                    collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()], value=ctrl.retriever.collection.name),
                    page_start_warning: gr.update(visible=False),
                    actual_page_start: gr.update(visible=False),
                }
            else:
                gr.Warning("File extension not supported. Only .docx, .pdf and .html are supported.")
                return {
                    input_doc_comp: gr.update(visible=True),
                    input_text_comp: gr.update(visible=False),
                    input_example_comp: gr.update(visible=False),
                    clear_btn: gr.update(visible=False),
                    include_images_btn: gr.update(visible=True, value=include_images_),
                    page_start_warning: gr.update(visible=True),
                    actual_page_start: gr.update(visible=True, value=1),
                }

        def input_file_clear():
            update_ = {
                input_doc_comp: gr.update(visible=True, value=None),
                clear_btn: gr.update(visible=False),
                input_text_comp: gr.update(value='', visible=False),
                histo_text_comp: gr.update(value='', visible=False),
                input_example_comp: gr.update(value='', visible=False),
                include_images_btn: gr.update(visible=True),
                upload_another_doc_btn: gr.update(visible=False),
                delete_database_btn: gr.update(visible=True),
                page_start_warning: gr.update(visible=True),
                actual_page_start: gr.update(visible=True, value=1),
                collections_list: gr.update(value=None, choices=[a.name for a in ctrl.client_db.list_collections()]),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False, value='hello')
            return update_

        def input_text_fn1(input_text_, histo_text_):
            histo_text_.append((input_text_, None))
            update_ = {
                histo_text_comp: gr.update(visible=True, value=histo_text_),
                input_example_comp: gr.update(visible=False),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False)
            return update_

        def input_text_fn2(input_text_, histo_text_):
            answer, sources = ctrl.get_response(query=input_text_, histo=histo_text_)
            histo_text_[-1] = (input_text_, answer)
            update_ = {
                histo_text_comp: gr.update(value=histo_text_),
                input_text_comp: gr.update(value=''),
            }
            for i in range(min(len(sources), 3)):
                s = sources[i]
                if i != 0:
                    prev = sources[i - 1]
                    if prev.index == s.index:
                        continue
                source_label = f'{s.index} {s.title} score = {s.distance_str}'
                source_text = s.content
                update_[source_text_comp[i]] = gr.update(visible=True, value=source_text, label=source_label)
            return update_

        def input_example_fn(input_example_, histo_text_):
            histo_text_.append((input_example_, None))
            update_ = {
                input_text_comp: gr.update(value=input_example_),
                histo_text_comp: gr.update(visible=True, value=histo_text_),
                input_example_comp: gr.update(visible=False, value=''),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False)
            return update_

        def clear_fn():
            update_ = {
                input_text_comp: gr.update(value=''),
                histo_text_comp: gr.update(value='', visible=False),
                input_example_comp: gr.update(value='', visible=True),
                upload_another_doc_btn: gr.update(visible=True),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False, value='hello')
            return update_

        def list_all_chroma_collections():
            update = {
                collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()]),
            }
            return update

        def change_collection(collection_name):
            ctrl.retriever.collection = ctrl.client_db.get_collection(collection_name, embedding_function=open_ai_embedding)
            return {
                delete_database_btn: gr.update(visible=True),
                input_doc_comp: gr.update(visible=False, value=None),
                input_text_comp: gr.update(visible=True, value=''),
                input_example_comp: gr.update(visible=True),
                clear_btn: gr.update(visible=True),
                collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()]),
                include_images_btn: gr.update(visible=False),
                histo_text_comp: gr.update(visible=False, value=''),
                upload_another_doc_btn: gr.update(visible=True),
                actual_page_start: gr.update(visible=False),
                page_start_warning: gr.update(visible=False),
            }

        def delete_curr_database():
            ctrl.client_db.delete_collection(ctrl.retriever.collection.name)
            gr.Info(f"Collection {ctrl.retriever.collection.name} deleted from the database")
            return {
                delete_database_btn: gr.update(visible=False),
                input_doc_comp: gr.update(visible=True, value=None),
                input_text_comp: gr.update(visible=False, value=''),
                input_example_comp: gr.update(visible=False),
                clear_btn: gr.update(visible=False),
                collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()]),
                include_images_btn: gr.update(visible=True),
                histo_text_comp: gr.update(visible=False, value=''),
                upload_another_doc_btn: gr.update(visible=False),
                actual_page_start: gr.update(visible=True, value=1),
                page_start_warning: gr.update(visible=True),
            }

        upload_another_doc_btn.click(input_file_clear,
                        inputs=None,
                        outputs=[collections_list, page_start_warning, actual_page_start, input_doc_comp, input_text_comp, input_example_comp, clear_btn, include_images_btn, histo_text_comp, delete_database_btn, upload_another_doc_btn, source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])

        delete_database_btn.click(delete_curr_database,
                        inputs=None,
                        outputs=[page_start_warning, actual_page_start, delete_database_btn, input_doc_comp, input_text_comp, input_example_comp, clear_btn, collections_list, include_images_btn, histo_text_comp, upload_another_doc_btn])

        collections_list.input(change_collection,
                        inputs=[collections_list],
                        outputs=[actual_page_start, page_start_warning, collections_list, input_text_comp, input_example_comp, clear_btn, include_images_btn, histo_text_comp, input_doc_comp, delete_database_btn, upload_another_doc_btn])

        input_doc_comp \
            .upload(input_doc_fn,
                    inputs=[input_doc_comp, include_images_btn, actual_page_start],
                    outputs=[page_start_warning, actual_page_start, input_doc_comp, input_text_comp, upload_another_doc_btn,
                             input_example_comp, include_images_btn, clear_btn, histo_text_comp, delete_database_btn, collections_list, source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(list_all_chroma_collections,
                  inputs=None,
                  outputs=[collections_list])

        input_doc_comp \
            .clear(input_file_clear,
                   inputs=None,
                   outputs=[page_start_warning, actual_page_start, input_doc_comp, clear_btn, upload_another_doc_btn, input_text_comp, histo_text_comp, input_example_comp, include_images_btn, delete_database_btn,
                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])

        input_text_comp \
            .submit(input_text_fn1,
                    inputs=[input_text_comp, histo_text_comp],
                    outputs=[histo_text_comp, input_example_comp,
                             source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(input_text_fn2,
                  inputs=[input_text_comp, histo_text_comp],
                  outputs=[input_text_comp, histo_text_comp,
                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
        input_example_comp \
            .input(input_example_fn,
                   inputs=[input_example_comp, histo_text_comp],
                   outputs=[input_text_comp, histo_text_comp, input_example_comp,
                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(input_text_fn2,
                  inputs=[input_text_comp, histo_text_comp],
                  outputs=[input_text_comp, histo_text_comp,
                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
        clear_btn.click(clear_fn,
                        inputs=None,
                        outputs=[input_text_comp, histo_text_comp, input_example_comp, upload_another_doc_btn,
                                 source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])

    return qna
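A schematic launch sketch for the view above: run wires the components to a Chatbot controller and returns the gr.Blocks app, which the caller launches. The Chatbot construction is not shown in this excerpt, so ctrl is assumed to be built as in app.py, and the config values are hypothetical:

from src.view import view

# ctrl: a Chatbot instance assumed to be built elsewhere (see app.py)
qna = view.run(ctrl, config={'title': '# Chat with your document', 'examples': {'ex1': 'What is this document about?'}})
qna.launch()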
styles.txt
ADDED
@@ -0,0 +1,18 @@
Document title: Defoe_RobinsonCrusoe1.pdf
{
    "content": [
        11.0,
        13.300000000000011,
        15.999999999999943,
        15.999999999999986,
        16.0,
        16.000000000000007,
        16.00000000000003
    ],
    "title2": [
        23.0
    ],
    "title1": [
        27.600000000000023
    ]
}