v2.

- .gitattributes +3 -0
- .gitignore +179 -0
- __pycache__/config.cpython-311.pyc +0 -0
- __pycache__/config_key.cpython-311.pyc +0 -0
- app.py +27 -0
- config.py +19 -0
- config_key.py +1 -0
- requirements.txt +0 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/config_key.cpython-311.pyc +0 -0
- src/control/__init__.py +0 -0
- src/control/__pycache__/__init__.cpython-311.pyc +0 -0
- src/control/__pycache__/control.cpython-311.pyc +0 -0
- src/control/control.py +116 -0
- src/model/__init__.py +0 -0
- src/model/__pycache__/__init__.cpython-311.pyc +0 -0
- src/model/__pycache__/block.cpython-311.pyc +0 -0
- src/model/__pycache__/container.cpython-311.pyc +0 -0
- src/model/__pycache__/doc.cpython-311.pyc +0 -0
- src/model/__pycache__/paragraph.cpython-311.pyc +0 -0
- src/model/block.py +58 -0
- src/model/container.py +112 -0
- src/model/doc.py +79 -0
- src/model/paragraph.py +39 -0
- src/tools/__init__.py +0 -0
- src/tools/__pycache__/__init__.cpython-311.pyc +0 -0
- src/tools/__pycache__/index_creation.cpython-311.pyc +0 -0
- src/tools/__pycache__/llm.cpython-311.pyc +0 -0
- src/tools/__pycache__/reader_html.cpython-311.pyc +0 -0
- src/tools/__pycache__/reader_pdf_tools.cpython-311.pyc +0 -0
- src/tools/__pycache__/reader_word.cpython-311.pyc +0 -0
- src/tools/__pycache__/readers_pdf.cpython-311.pyc +0 -0
- src/tools/__pycache__/retriever.cpython-311.pyc +0 -0
- src/tools/__pycache__/table_converter.cpython-311.pyc +0 -0
- src/tools/index_creation.py +67 -0
- src/tools/llm.py +149 -0
- src/tools/pretty_print.py +33 -0
- src/tools/reader_html.py +118 -0
- src/tools/reader_pdf_tools.py +56 -0
- src/tools/reader_word.py +106 -0
- src/tools/readers_pdf.py +428 -0
- src/tools/retriever.py +49 -0
- src/tools/table_converter.py +14 -0
- src/view/__pycache__/view.cpython-311.pyc +0 -0
- src/view/view.py +262 -0
- styles.txt +18 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+env/lib/python3.10/site-packages/*.so filter=lfs diff=lfs merge=lfs -text
+env/lib/python3.10/site-packages/*.dylib filter=lfs diff=lfs merge=lfs -text
+env/lib/python3.10/site-packages/**/*.js.map filter=lfs diff=lfs merge=lfs -text

.gitignore
ADDED
@@ -0,0 +1,179 @@

config_key.py


#library package
sqlite_updated/

#Test folder + files
data/Test/
test.py
test_read.py
styles.txt

#database folder
database/
database_structure/
database_word/
Ilumio_chatbot/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy
.env
# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

__pycache__/config.cpython-311.pyc
ADDED
Binary file (1.41 kB).

__pycache__/config_key.cpython-311.pyc
ADDED
Binary file (239 Bytes).

app.py
ADDED
@@ -0,0 +1,27 @@
import os
from config import *
from src.tools.llm import LlmAgent
import src.view.view as view
from src.control.control import Chatbot
import chromadb
from src.tools.retriever import Retriever

os.environ["TOKENIZERS_PARALLELISM"] = "true"

if "OPENAI_API_KEY" not in os.environ:
    from config_key import OPENAI_API_KEY
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

llm_model = "gpt-4"
llm = LlmAgent(llm_model=llm_model)

if not os.path.exists("database_structure/"):
    os.makedirs("database_structure/")

client_db = chromadb.PersistentClient("database_structure/")

chat = Chatbot(client_db=client_db, retriever=Retriever(llmagent=llm), llm_agent=llm)

ilumio_qna = view.run(ctrl=chat, config=view_config)

ilumio_qna.queue().launch()

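A note on the persistent Chroma client created above: collections written under "database_structure/" survive process restarts, which is what upload_doc in src/control/control.py relies on to skip re-indexing a document it has already seen. A minimal sketch of that behaviour (not part of the commit; the collection name and document are made up, and Chroma's default embedding model is used, which may be downloaded on first use):

import chromadb

client = chromadb.PersistentClient("database_structure/")
collection = client.get_or_create_collection(name="demo_collection")  # hypothetical name
if collection.count() == 0:
    collection.add(documents=["hello world"], ids=["doc-1"])
print(collection.count())  # 1, and still 1 after restarting the process
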
config.py
ADDED
@@ -0,0 +1,19 @@
content_language = 'en'
plan_language = 'en'
content_en_path_real = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
content_test = "data/Test/Illumio_product_brief.pdf"
content_python = "data/cours-python_crop.docx"
content_html = "data/Test/list.html"
content_data_analyst = "data/Test/Data_Analyst_chez_Stockly.pdf"
content_test_epita = "data/Test/Test_epita.pdf"

examples = {"Question_1": "What is the max_results parameter for async traffic queries ?",
            "Question_2": "How can I use the Public Experimental Provisioning API to determine if a specific set of objects can be provisioned?",
            "Question_3": "Explain the potential challenges and workarounds when using json-query with the curl -i option. Why might this combination lead to errors?",
            }


view_config = {
    'title': "<h1 style=text-align:center;font-size:4.5em;background-image:linear-gradient(45deg,#f3ec78,#af4261);background-color:red;background-size:100%;background-repeat:repeat;-webkit-background-clip:text;-webkit-text-fill-color:transparent;-moz-background-clip:text;-moz-text-fill-color:transparent;font-weight:bold;margin-top:4%;padding-bottom:1%>Document QnA</h1>",
    'examples': examples,
}

config_key.py
ADDED
@@ -0,0 +1 @@
OPENAI_API_KEY = "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"

requirements.txt
ADDED
Binary file (5.51 kB).

src/__init__.py
ADDED
File without changes

src/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (172 Bytes).

src/__pycache__/config_key.cpython-311.pyc
ADDED
Binary file (269 Bytes).

src/control/__init__.py
ADDED
File without changes

src/control/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (180 Bytes).

src/control/__pycache__/control.cpython-311.pyc
ADDED
Binary file (7.87 kB).

src/control/control.py
ADDED
@@ -0,0 +1,116 @@
import os
import chromadb
from src.tools.retriever import Retriever
from src.tools.llm import LlmAgent
from src.model.block import Block
from src.model.doc import Doc
from chromadb.utils import embedding_functions
import gradio as gr


class Chatbot:
    def __init__(self, llm_agent: LlmAgent = None, retriever: Retriever = None, client_db=None):
        self.retriever = retriever
        self.llm = llm_agent
        self.client_db = client_db

    def get_response(self, query, histo):
        histo_conversation, histo_queries = self._get_histo(histo)
        language_of_query = self.llm.detect_language_v2(query).lower()
        queries = self.llm.translate_v2(histo_queries)
        if "en" in language_of_query:
            language_of_query = "en"
        else:
            language_of_query = "fr"
        block_sources = self.retriever.similarity_search(queries=queries)
        block_sources = self._select_best_sources(block_sources)
        sources_contents = [f"Paragraph title : {s.title}\n-----\n{s.content}" if s.title else f"Paragraph {s.index}\n-----\n{s.content}" for s in block_sources]
        context = '\n'.join(sources_contents)
        i = 1
        while (len(context) + len(histo_conversation) > 15000) and i < len(sources_contents):
            context = "\n".join(sources_contents[:-i])
            i += 1
        answer = self.llm.generate_paragraph_v2(query=query, histo=histo_conversation, context=context, language=language_of_query)
        answer = self._clean_chatgpt_answer(answer)
        return answer, block_sources

    @staticmethod
    def _select_best_sources(sources: [Block], delta_1_2=0.15, delta_1_n=0.3, absolute=1.2, alpha=0.9) -> [Block]:
        """
        Select the best sources: not far from the very best, not far from the last selected, and not too bad per se
        """
        best_sources = []
        for idx, s in enumerate(sources):
            if idx == 0 \
                    or (s.distance - sources[idx - 1].distance < delta_1_2
                        and s.distance - sources[0].distance < delta_1_n) \
                    or s.distance < absolute:
                best_sources.append(s)
                delta_1_2 *= alpha
                delta_1_n *= alpha
                absolute *= alpha
            else:
                break
        return best_sources

    @staticmethod
    def _get_histo(histo: [(str, str)]) -> (str, str):
        histo_conversation = ""
        histo_queries = ""

        for (query, answer) in histo[-5:]:
            histo_conversation += f'user: {query} \n bot: {answer}\n'
            histo_queries += query + '\n'
        return histo_conversation[:-1], histo_queries

    @staticmethod
    def _clean_answer(answer: str) -> str:
        print(answer)
        answer = answer.strip('bot:')
        while answer and answer[-1] in {"'", '"', " ", "`"}:
            answer = answer[:-1]
        while answer and answer[0] in {"'", '"', " ", "`"}:
            answer = answer[1:]
        answer = answer.strip('bot:')
        if answer:
            if answer[-1] != ".":
                answer += "."
        return answer

    def _clean_chatgpt_answer(self, answer: str) -> str:
        answer = answer.strip('bot:')
        answer = answer.strip('Answer:')
        answer = answer.strip('Réponse:')
        while answer and answer[-1] in {"'", '"', " ", "`"}:
            answer = answer[:-1]
        return answer

    def upload_doc(self, input_doc, include_images_, actual_page_start):
        title = Doc.get_title(Doc, input_doc.name)
        extension = title.split('.')[-1]
        if extension and (extension == 'docx' or extension == 'pdf' or extension == 'html'):
            open_ai_embedding = embedding_functions.OpenAIEmbeddingFunction(api_key=os.environ['OPENAI_API_KEY'], model_name="text-embedding-ada-002")
            coll_name = "".join([c if c.isalnum() else "_" for c in title])
            collection = self.client_db.get_or_create_collection(name=coll_name, embedding_function=open_ai_embedding)

            if collection.count() == 0:
                gr.Info("Please wait while your document is being analysed")
                print("Database is empty")
                doc = Doc(path=input_doc.name, include_images=include_images_, actual_first_page=actual_page_start)

                # for block in doc.blocks:  # DEBUG PART
                #     print(f"{block.index} : {block.content}")

                retriever = Retriever(doc.container, collection=collection, llmagent=self.llm)
            else:
                print("Database is not empty")
                retriever = Retriever(collection=collection, llmagent=self.llm)

            self.retriever = retriever
        else:
            return False
        return True

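To make the thresholding in _select_best_sources concrete, here is an illustrative run (not part of the commit; the distances are made up, and the repository root is assumed to be on PYTHONPATH). A hit is kept if it is the first one, close enough to both the previous hit and the best hit, or good in absolute terms; each accepted hit shrinks all three tolerances by alpha, and the first rejection stops the scan:

from src.model.block import Block
from src.control.control import Chatbot

blocks = [Block(index=str(i), distance=d) for i, d in enumerate([0.9, 1.0, 1.05, 1.6])]
kept = Chatbot._select_best_sources(blocks)
# 0.9 is kept as the first hit; 1.0 and 1.05 stay within the decayed deltas;
# 1.6 fails every test, so selection stops there.
print([b.distance for b in kept])  # [0.9, 1.0, 1.05]
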
src/model/__init__.py
ADDED
File without changes

src/model/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (178 Bytes).

src/model/__pycache__/block.cpython-311.pyc
ADDED
Binary file (3.04 kB).

src/model/__pycache__/container.cpython-311.pyc
ADDED
Binary file (5.77 kB).

src/model/__pycache__/doc.cpython-311.pyc
ADDED
Binary file (4.05 kB).

src/model/__pycache__/paragraph.cpython-311.pyc
ADDED
Binary file (2.64 kB).

src/model/block.py
ADDED
@@ -0,0 +1,58 @@
import math

class Block:
    def __init__(self, doc: str = '', title: str = '', content: str = '',
                 index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
        self.doc = doc
        self.title = title
        self.content = content
        self.index = index
        self.rank = rank
        self.level = level
        self.distance = distance

    @property
    def distance_str(self) -> str:
        return format(self.distance, '.2f')

    def separate_1_block_in_n(self, max_size=4500):
        """
        Separate a block in n blocks of equal size
        """
        content_length = len(self.content)
        n = math.ceil(content_length / max_size)
        block_size = content_length // n
        new_blocks = []
        for i in range(n):
            start = i * block_size
            end = (i + 1) * block_size if i < n - 1 else None
            new_blocks.append(Block(doc=self.doc,
                                    title=self.title + f"_part{i}",
                                    content=self.content[start:end],
                                    index=self.index + f"_{i}",
                                    rank=self.rank,
                                    level=self.level))
        return new_blocks

    def to_dict(self) -> {}:
        block_dict = {'doc': self.doc,
                      'title': self.title,
                      'content': self.content,
                      'index': self.index,
                      'rank': self.rank,
                      'level': self.level,
                      'distance': self.distance}
        return block_dict

    def from_dict(self, block_dict: {}):
        self.doc = block_dict['doc']
        self.title = block_dict['title']
        self.content = block_dict['content']
        self.index = block_dict['index']
        self.rank = block_dict['rank']
        self.level = block_dict['level']
        self.distance = block_dict['distance']
        return self

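separate_1_block_in_n splits oversized blocks into near-equal chunks so each stays under the embedding size limit. A quick sketch (not part of the commit; repository root assumed on PYTHONPATH):

from src.model.block import Block

b = Block(doc="demo", title="T", content="x" * 10000, index="1")
parts = b.separate_1_block_in_n(max_size=4500)
print(len(parts))                       # 3, i.e. ceil(10000 / 4500)
print([len(p.content) for p in parts])  # [3333, 3333, 3334]
print(parts[0].title, parts[0].index)   # T_part0 1_0
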
src/model/container.py
ADDED
@@ -0,0 +1,112 @@
from .paragraph import Paragraph
from .block import Block

INFINITE = 99999

class Container:

    def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, index: [int] = None, father=None, id_=0):
        if index is None:
            index = []
        self.level = level
        self.title = title
        self.paragraphs = []
        self.children = []
        self.index = index
        self.father = father
        self.id_ = int(str(1) + str(father.id_) + str(id_))
        if paragraphs:
            self.paragraphs, self.children = self.create_children(paragraphs, level, index)
        self.containers = [self]
        for child in self.children:
            self.containers += child.containers
        self.blocks = self.get_blocks()

    def get_blocks(self):
        block = Block(level=self.level, index=self.index)
        if self.title:
            self.title.text = self.title.text.replace('\r', '').replace('\n', '')
            block.title = self.title.text
            block.content = self.title.text + '/'
            temp_father = self.father
            while temp_father and type(temp_father) == Container:
                if temp_father.title:
                    temp_father.title.text = temp_father.title.text.replace('\r', '').replace('\n', '')
                    block.content = temp_father.title.text + '/' + block.content
                temp_father = temp_father.father
            block.content += " :\n\n"
        i = 0
        for p in self.paragraphs:
            if not p.blank:
                i = 1
                block.content += p.text
        if i == 0:
            blocks = []
        else:
            blocks = [block]
        for child in self.children:
            blocks += child.blocks
        return blocks

    def create_children(self, paragraphs, level, rank) -> ([], []):
        """
        creates children containers or directly attached content
        and returns the list of containers and contents of level+1
        :return:
        [Content or Container]
        """
        attached_paragraphs = []
        container_paragraphs = []
        container_title = None
        children = []
        in_children = False
        level = INFINITE
        child_id = 0

        while paragraphs:
            p = paragraphs.pop(0)
            if not in_children and not p.is_structure:
                attached_paragraphs.append(p)
            else:
                in_children = True
                if p.blank:
                    continue
                if p.is_structure and p.level <= level:  # if p is higher or equal in hierarchy
                    if container_paragraphs or container_title:
                        children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
                        child_id += 1
                    container_paragraphs = []
                    container_title = p
                    level = p.level
                else:  # p is strictly lower in hierarchy
                    container_paragraphs.append(p)

        if container_paragraphs or container_title:
            children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
            child_id += 1

        return attached_paragraphs, children

    @property
    def structure(self):

        self_structure = {str(self.id_): {
            'index': str(self.id_),
            'canMove': True,
            'isFolder': True,
            'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
            'canRename': True,
            'data': {},
            'level': self.level,
            'rank': self.rank,
            'title': self.title.text if self.title else 'root'
        }}
        paragraphs_structure = [p.structure for p in self.paragraphs]
        structure = [self_structure] + paragraphs_structure
        for child in self.children:
            structure += child.structure
        return structure

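Container builds a section tree from a flat paragraph stream: structural paragraphs open a new child at their level, everything else attaches to the innermost open section. A small sketch (not part of the commit; note that father must expose an id_ attribute, as Doc does, so a stand-in is used here):

from types import SimpleNamespace
from src.model.paragraph import Paragraph
from src.model.container import Container

paras = [
    Paragraph("Intro", "title1", id_=0, page_id=1),
    Paragraph("Some body text.", "content", id_=1, page_id=1),
    Paragraph("Details", "title2", id_=2, page_id=1),
    Paragraph("More body text.", "content", id_=3, page_id=1),
]
root = Container(paras, father=SimpleNamespace(id_=0))
print(len(root.children))                       # 1: the "Intro" section
print(root.children[0].children[0].title.text)  # "Details", nested under "Intro"
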
src/model/doc.py
ADDED
@@ -0,0 +1,79 @@
from src.model.container import Container
from src.tools.index_creation import set_indexes
from src.tools.reader_word import WordReader
from src.tools.readers_pdf import Reader, Reader_illumio
from src.tools.reader_html import Reader_HTML
from src.model.paragraph import Paragraph


class Doc:

    def __init__(self, path='', include_images=True, actual_first_page=1):

        self.title = self.get_title(path)
        self.extension = self.title.split('.')[-1]
        self.id_ = id(self)
        self.path = path
        paragraphs = []
        if self.extension == 'docx':
            paragraphs = WordReader(path).paragraphs
        elif self.extension == 'pdf':
            if "Illumio_Core_REST_API_Developer_Guide_23.3" in self.title:
                paragraphs = Reader_illumio(path).paragraphs
            else:
                paragraphs = Reader(path, actual_first_page, include_images).paragraphs
        else:
            paragraphs = Reader_HTML(path).paragraphs
        self.container = Container(paragraphs, father=self, title=self.set_first_container_title(self.title.split(".")[0], self.extension))
        set_indexes(self.container)
        self.blocks = self.get_blocks()

    def get_title(self, path) -> str:
        if '/' not in path and '\\' not in path:
            res = path
        if '/' in path:
            res = path.split('/')[-1]
        if '\\' in path:
            res = path.split('\\')[-1]
        return res

    @property
    def structure(self):
        return self.container.structure

    def get_blocks(self):

        def from_list_to_str(index_list):
            index_str = str(index_list[0])
            for el in index_list[1:]:
                index_str += '.' + str(el)
            return index_str

        blocks = self.container.blocks
        for block in blocks:
            block.doc = self.title
            block.index = from_list_to_str(block.index)
        return blocks

    def set_first_container_title(self, title, extension) -> Paragraph:
        if extension == 'pdf':
            return Paragraph(text=title, font_style='title0', id_=0, page_id=0)
        elif extension == 'docx':
            return Paragraph(text=title, font_style='title0', id_=0, page_id=1)
        else:
            return Paragraph(text=title, font_style='h0', id_=0, page_id=1)

    """
    current_level = len(current_index)
    if 0 < block.level:
        if block.level == current_level:
            current_index[-1] += 1
        elif current_level < block.level:
            current_index.append(1)
        elif block.level < current_level:
            current_index = current_index[:block.level]
            current_index[-1] += 1
        block.index = from_list_to_str(current_index)
    else:
        block.index = "0"
    """

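get_title is effectively a static path helper, and control.py calls it without an instance by passing the class itself as self (Doc.get_title(Doc, path)). A sketch of that calling convention (not part of the commit; importing src.model.doc assumes the reader dependencies from requirements.txt are installed):

from src.model.doc import Doc

print(Doc.get_title(Doc, "data/Test/list.html"))  # "list.html"
print(Doc.get_title(Doc, "C:\\docs\\guide.pdf"))  # "guide.pdf" (backslash paths work too)
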
src/model/paragraph.py
ADDED
@@ -0,0 +1,39 @@
import string

INFINITE = 10000

class Paragraph:
    def __init__(self, text: str, font_style: str, id_: int, page_id: int):
        self.font_style = font_style
        self.id_ = int(str(2) + str(page_id) + str(id_))
        self.page_id = page_id
        self.level = self.handle_levels(font_style)
        self.is_structure = self.level < INFINITE
        self.text = text

    @property
    def blank(self):
        """
        checks if the paragraph is blank: i.e. it brings some signal (it may otherwise be ignored)
        """
        text = self.text.replace('\n', '')
        return set(text).isdisjoint(string.ascii_letters)

    def rearrange_paragraph(self):
        """
        rearrange the paragraph to have a better structure
        """
        if self.font_style == "code":
            self.text = "\n\nCode :```\n" + self.text + "\n```\n\n"
        elif self.font_style == "table":
            self.text = "\n\nTable :\n" + self.text + "\n\n"
        return self

    def handle_levels(self, font_style: str):
        if len(font_style) != 5 and 'title' in font_style:
            return int(font_style[-1])
        elif len(font_style) == 2 and font_style[0] == 'h':
            return int(font_style[-1])
        else:
            return INFINITE

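Two details of Paragraph worth illustrating: blank treats any text without ASCII letters as noise, and handle_levels maps both "titleN" and "hN" font styles to a numeric level, with everything else pushed to INFINITE (non-structural). A sketch (not part of the commit):

from src.model.paragraph import Paragraph

print(Paragraph("  \n\t", "content", id_=0, page_id=1).blank)  # True: no letters, no signal
print(Paragraph("API", "title2", id_=1, page_id=1).level)      # 2
print(Paragraph("API", "h2", id_=2, page_id=1).level)          # 2
print(Paragraph("body", "content", id_=3, page_id=1).level)    # 10000 (INFINITE)
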
src/tools/__init__.py
ADDED
File without changes

src/tools/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (178 Bytes).

src/tools/__pycache__/index_creation.cpython-311.pyc
ADDED
Binary file (4.83 kB).

src/tools/__pycache__/llm.cpython-311.pyc
ADDED
Binary file (11.7 kB).

src/tools/__pycache__/reader_html.cpython-311.pyc
ADDED
Binary file (8.28 kB).

src/tools/__pycache__/reader_pdf_tools.cpython-311.pyc
ADDED
Binary file (3.64 kB).

src/tools/__pycache__/reader_word.cpython-311.pyc
ADDED
Binary file (4.72 kB).

src/tools/__pycache__/readers_pdf.cpython-311.pyc
ADDED
Binary file (25.1 kB).

src/tools/__pycache__/retriever.cpython-311.pyc
ADDED
Binary file (3.24 kB).

src/tools/__pycache__/table_converter.cpython-311.pyc
ADDED
Binary file (1.03 kB).

src/tools/index_creation.py
ADDED
@@ -0,0 +1,67 @@
from src.model.container import Container

INFINITE = 99999

def create_dic_levels(c: Container, dict_of_levels: dict = None):
    # Use None instead of a mutable default argument: a shared default dict
    # would leak level state between successive documents.
    if dict_of_levels is None:
        dict_of_levels = {}
    if c.level == 0:
        dict_of_levels[c.level] = [0]
    for child in c.children:
        if child.level not in dict_of_levels:
            dict_of_levels[child.level] = [1 for _ in range(child.level)]
        create_dic_levels(child, dict_of_levels)
    if INFINITE in dict_of_levels.keys():
        dict_of_levels[INFINITE] = [1]
    return dict_of_levels


def create_good_indexes(c: Container, dict_of_levels: dict):
    actual_level = c.level
    c.index = dict_of_levels[actual_level].copy()
    actual_len = len(dict_of_levels[actual_level])
    temp_update = dict_of_levels[actual_level][-1]
    dict_of_levels[actual_level][-1] += 1
    for i in dict_of_levels.values():
        if len(i) > actual_len:
            i[actual_len - 1] = temp_update
    for child in c.children:
        c_lvl = child.level
        for i in dict_of_levels.values():
            if len(i) > c_lvl:
                i[c_lvl:] = [1 for _ in range(len(i[c_lvl:]))]
        create_good_indexes(child, dict_of_levels)  # Apply the function recursively to all children


def create_good_indexes_not_ordered_titles(c: Container, dict_of_levels: dict):
    actual_level = c.level
    c.index = dict_of_levels[actual_level].copy()
    actual_len = len(dict_of_levels[actual_level])
    temp_update = dict_of_levels[actual_level][-1]
    dict_of_levels[actual_level][-1] += 1
    for i in dict_of_levels.values():
        if len(i) > actual_len:
            i[actual_len - 1] = temp_update
    for child in c.children:
        c_lvl = child.level
        for i in dict_of_levels.values():
            if len(i) > c_lvl:
                i[c_lvl:] = [1 for _ in range(len(i[c_lvl:]))]
        create_good_indexes(child, dict_of_levels)  # Apply the function recursively to all children


def set_good_block_indexes(c: Container):
    for i in c.containers:
        for b in i.blocks:
            b.index = i.index


def set_indexes(c: Container):
    dict_levels = create_dic_levels(c)
    my_keys = sorted(dict_levels.keys())
    dict_levels = {key: dict_levels[key] for key in my_keys}
    if c.children and c.children[0] and (c.children[0].level > min(list(dict_levels.keys())[1:])):
        c.children[0].level = min(list(dict_levels.keys())[1:])
        create_good_indexes_not_ordered_titles(c, dict_levels)
    else:
        create_good_indexes(c, dict_levels)
    set_good_block_indexes(c)

src/tools/llm.py
ADDED
@@ -0,0 +1,149 @@
import openai

class LlmAgent:

    def __init__(self, llm_model: str):
        self.llm = llm_model

    def generate_paragraph(self, query: str, context: {}, histo: [(str, str)], language='fr') -> str:
        """generates the answer"""
        template = (f"You are a conversation bot designed to answer to the query from users."
                    f"Your answer is based on the context delimited by triple backticks :\n ``` {context} ```\n"
                    f"You are consistent and avoid redundancies with the rest of the initial conversation delimited by triple backticks :\n ``` {histo} ```\n"
                    f"Your response shall be in {language} and shall be concise."
                    f"You shall only provide the answer, nothing else before and after."
                    f"Here is the query you are given :\n"
                    f"``` {query} ```")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": template}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----")
        return str(res)

    def generate_paragraph_v2(self, query: str, context: {}, histo: [(str, str)], language='fr') -> str:
        """generates the answer"""
        context_for_the_ai = (f"You are a conversation bot designed to answer to the query from users."
                              f"Your answer is based on the context delimited by triple backticks :\n ``` {context} ```\n"
                              f"You are consistent and avoid redundancies with the rest of the initial conversation delimited by triple backticks :\n ``` {histo} ```\n"
                              f"Your response shall be in {language} and shall be concise.")
        generation = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k", messages=[{"role": "system", "content": context_for_the_ai}, {"role": "user", "content": query}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----")
        return str(res)

    def translate(self, text: str) -> str:
        """translates"""
        template = (f"Your task consists in translating in English the following text delimited by triple backticks: ``` {text} ```\n"
                    f"If the text is already in English, just return it !\n"
                    f"You must not provide an answer to the text, just translate it.\n")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": template}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----TRANSLATE----")
        return res

    def translate_v2(self, text: str) -> str:
        """translates"""
        task = "Translate in English the text. If it is already in English, just return the text."
        generation = openai.ChatCompletion.create(model="gpt-4", messages=[{"role": "system", "content": task}, {"role": "user", "content": text}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----TRANSLATE V2----")
        return res

    def generate_answer(self, query: str, answer: str, histo: str, context: str, language: str) -> str:
        """provides the final answer in {language} based on the initial query and the answer in english"""
        template = (f"Your task consists in translating the answer in {language}, if its not already the case, to the query "
                    f"delimited by triple backticks: ```{query}``` \n"
                    f"You don't add new content to the answer but: "
                    f"1 You can use some vocabulary from the context delimited by triple backticks:\n"
                    f"```{context}```\n"
                    f"2 You are consistent and avoid redundancies with the rest of the initial "
                    f"conversation delimited by triple backticks: ```{histo}```\n"
                    f"Your response shall respect the following format:<response>\n"
                    f"Here is the answer you are given in {language}:"
                    f"{answer}")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": template}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----")
        return str(res).strip()

    def summarize_paragraph(self, prompt: str, title_doc: str = '', title_para: str = ''):
        """summarizes the paragraph"""
        max_tokens = 700
        template = (f"Your task consists in summarizing the paragraph of the document untitled ```{title_doc}```."
                    f"The paragraph title is ```{title_para}```."
                    f"Your response shall be concise and shall respect the following format:"
                    f"<summary>"
                    f"If you see that the summary that you are creating will not respect ```{max_tokens}``` tokens, find a way to make it shorter."
                    f"The paragraph you need to summarize is the following :"
                    f"{prompt}")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": template}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----")
        return str(res).strip()

    def summarize_paragraph_v2(self, prompt: str, title_doc: str = '', title_para: str = ''):
        """summarizes the paragraph"""
        max_tokens = 850
        location_of_the_paragraph = prompt.split(" :")[0]
        task = (f"Your task consists in summarizing in English the paragraph of the document untitled ```{title_doc}``` located in the ```{location_of_the_paragraph}``` section of the document."
                f"The paragraph title is ```{title_para}```."
                f"Your response shall be concise and shall respect the following format:"
                f"<summary>"
                f"If you see that the summary that you are creating will not respect ```{max_tokens}``` tokens, find a way to make it shorter.")
        generation = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k", messages=[{"role": "system", "content": task}, {"role": "user", "content": prompt}])
        res = generation.choices[0].message.content
        print("****************")
        print(res)
        print("----")
        return str(res).strip()

    def transform_paragraph_into_question(self, prompt: str, title_doc: str = '', title_para: str = '') -> (str, str):
        max_tokens = 150

        prompt_template = (f"Your job is to create two questions about a paragraph of a document untitled ```{title_doc}```."
                           f"The paragraph title is ```{title_para}```."
                           f"If you see that the questions that you are creating will not respect ```{max_tokens}``` tokens, find a way to make them shorter."
                           f"If you can't create a question about the paragraph, just rephrase ```{title_para}``` so that it becomes a question."
                           f"Your response shall contain two questions, shall be concise and shall respect the following format:"
                           f"`<question1>!=;<question2>`"
                           f"You should not answer to the questions, just create them. Moreover, you shall include the title of the paragraph in the questions."
                           f"The paragraph you need to create questions about is the following :"
                           f"{prompt}")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": prompt_template}])
        res = generation.choices[0].message.content
        print("****************")
        res = str(res).split("!=;")
        if len(res) == 1:
            return (res[0], "")
        elif len(res) == 2:
            return (res[0], res[1])
        else:
            return ("", "")

    def detect_language(self, text: str) -> str:
        """detects the language"""
        template = (f"Your task consists in detecting the language of the last question or sentence of the text."
                    f"You should only give the two letters code of the language detected, nothing else."
                    f"Here is the text you are given delimited by triple backticks : ```{text}```")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "user", "content": template}])
        res = generation.choices[0].message.content
        return str(res).strip()

    def detect_language_v2(self, text: str) -> str:
        """detects the language"""
        task = (f"Your task consists in detecting the language of the last question or sentence of the text."
                f"You should only give the two letters code of the language detected, nothing else.")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role": "system", "content": task}, {"role": "user", "content": text}])
        res = generation.choices[0].message.content
        return str(res).strip()

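LlmAgent wraps the pre-1.0 openai package (openai.ChatCompletion), so the key is taken from module state rather than passed per call. A hypothetical usage sketch (not part of the commit; requires OPENAI_API_KEY in the environment and the openai<1.0 dependency pinned by requirements.txt):

import os
import openai
from src.tools.llm import LlmAgent

openai.api_key = os.environ["OPENAI_API_KEY"]
agent = LlmAgent(llm_model="gpt-4")
print(agent.detect_language_v2("Bonjour, comment ça va ?"))  # expected: "fr"
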
src/tools/pretty_print.py
ADDED
@@ -0,0 +1,33 @@
from src.model.paragraph import Paragraph
from src.model.container import Container


# function that pretty prints the paragraphs
def pretty_printer_paragraphs(paragraphs):
    for p in paragraphs:
        if p.font_style == "title1":
            print(f"Titre 1 {p.text}")
        elif p.font_style == "title2":
            print(f"---> Titre 2 {p.text}")
        elif p.font_style == "title3":
            print(f"-------> Titre 3 {p.text}")
        elif p.font_style == "title4":
            print(f"-----------> Titre 4 {p.text}")
        elif p.font_style == "content":
            print(f"---------------> {p.text}")
        elif p.font_style == "code":
            print(f"----------code------------> {p.text}")
        elif p.font_style == "table":
            print(f"----------table------------> {p.text}")

def pretty_print_container_structure(container):
    if container.title:
        print(f"{'-'*container.level} {container.title.text}")
    for p in container.paragraphs:
        print(f"{'-'*container.level} {p.text}")
    for c in container.children:
        pretty_print_container_structure(c)

def print_all_block_indexes(container):
    for b in container.blocks:
        print(f'{b.index} : {b.title if b.title else ""}')

src/tools/reader_html.py
ADDED
@@ -0,0 +1,118 @@
from pyquery import PyQuery as pq
from src.model.paragraph import Paragraph
from bs4 import BeautifulSoup
from src.tools.readers_pdf import Reader_illumio
from src.tools.table_converter import table_converter

class Reader_HTML:
    def __init__(self, path):
        self.path = path
        self.paragraphs = self.read_html_2(path)

    # pyquery-based variant; kept for reference, but it does not work well
    def read_html(self, path):
        with open(path, 'r') as html_file:
            doc = pq(html_file.read())

        # Remove script and style elements
        doc('script').remove()
        doc('style').remove()

        paragraphs = []
        for index, elem in enumerate(doc('*')):
            # Check if the element is a leaf (does not contain other elements)
            if not pq(elem).find('*'):
                text = pq(elem).text().strip()
                if text:
                    paragraphs.append(Paragraph(text=text, font_style=elem.tag, id_=index, page_id=1))
        return paragraphs

    # BeautifulSoup-based variant, used by __init__
    def read_html_2(self, path):
        # Reading the file
        with open(path, "r") as html_file:
            reader = html_file.read()
        paragraphs = []
        # Creating a BeautifulSoup object and specifying the parser
        S = BeautifulSoup(reader, 'html.parser')
        for tag in S(['style', 'script', 'footer', 'header', 'nav', 'aside', 'form']):
            tag.decompose()

        # Get all elements that do not contain other elements
        leaf_elements = [elem for elem in S.body.descendants if elem.name is not None and not elem.find_all()]
        paragraphs = []
        for index, elem in enumerate(leaf_elements):
            text = elem.get_text(strip=True, separator='\n')
            if text:
                p = Paragraph(text=text, font_style=elem.name, id_=index, page_id=1)
                paragraphs.append(p)
        paragraphs = self.concatenate_paragraphs_with_same_font_style(paragraphs)
        paragraphs = [p.rearrange_paragraph() for p in paragraphs]
        return paragraphs

    def concatenate_paragraphs_with_same_font_style(self, paragraphs: [Paragraph]):
        i = 0
        while i < len(paragraphs) - 1:
            if paragraphs[i].font_style == "th":
                paragraphs = self.create_table(paragraphs, i)
                i += 1
            elif paragraphs[i].font_style == "li":
                paragraphs, i = self.create_list(paragraphs, i)
                i += 1
            elif paragraphs[i].font_style == paragraphs[i+1].font_style:
                paragraphs[i].text += "\n" + paragraphs[i+1].text
                paragraphs.pop(i+1)
            else:
                i += 1
        return paragraphs

    def create_table(self, paragraphs, i: int):
        table = []
        titles = []
        content = []
        while i < len(paragraphs) and paragraphs[i].font_style == "th":
            titles.append(paragraphs[i].text)
            paragraphs.pop(i)
        table.append(titles)
        length = len(titles)
        temp = 0
        while i < len(paragraphs) and paragraphs[i].font_style == "td":
            if temp == length:
                # a full row has been collected: flush it before starting the next one
                temp = 0
                table.append(content)
                content = [paragraphs[i].text]
            else:
                content.append(paragraphs[i].text)
            paragraphs.pop(i)
            temp += 1
        table.append(content)
        paragraphs.insert(i, Paragraph(table_converter(table), font_style="table", id_=i, page_id=1))
        return paragraphs

    def create_list(self, paragraphs, i: int):
        list_content = []
        while i < len(paragraphs) and paragraphs[i].font_style in ["ul", "ol", "li"]:
            if paragraphs[i].font_style == "li":
                list_content.append(paragraphs[i].text)
                paragraphs.pop(i)
            elif paragraphs[i].font_style in ["ul", "ol"]:
                sublist, i = self.create_list(paragraphs, i+1)
                list_content.append(sublist)
            else:
                i += 1
        list_paragraph = Paragraph(text=self.format_list(list_content), font_style="list", id_=i, page_id=1)
        paragraphs.insert(i, list_paragraph)
        return paragraphs, i

    def format_list(self, list_content):
        res = ""
        for i in range(len(list_content)):
            if type(list_content[i]) == str:
                res += f"{i+1}. {list_content[i]}\n"
            else:
                res += f"{i+1}. {self.format_list(list_content[i])}\n"
        return res

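Reader_HTML keeps only leaf elements, merges consecutive leaves with the same tag, and folds th/td runs into tables and li runs into lists. A small sketch (not part of the commit; the temporary file name is arbitrary, and the PDF-reader dependencies must be installed since readers_pdf is imported transitively):

from src.tools.reader_html import Reader_HTML

html = "<html><body><h1>Guide</h1><p>First line.</p><p>Second line.</p></body></html>"
with open("/tmp/demo.html", "w") as f:
    f.write(html)

for p in Reader_HTML("/tmp/demo.html").paragraphs:
    print(p.font_style, repr(p.text))
# h1 'Guide'                       -- kept as a structural title
# p  'First line.\nSecond line.'   -- the two <p> leaves are merged
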
src/tools/reader_pdf_tools.py
ADDED
@@ -0,0 +1,56 @@
def flatten(S):
    if S == []:
        return S
    if isinstance(S[0], list):
        return flatten(S[0]) + flatten(S[1:])
    return S[:1] + flatten(S[1:])

def keep_int_and_floats_in_list(S):
    i = 0
    while i < len(S):
        if isinstance(S[i], str):
            S.pop(i)
        else:
            i += 1
    return S

def group_formats(formats: list) -> list:
    # create a list of lists of formats that are close to each other (at most 0.2 apart)
    formats = sorted(formats)
    groups = []
    current_group = []
    current_format = formats[0]
    for format in formats:
        if format - current_format <= 0.20:
            current_group.append(format)
        else:
            groups.append(current_group)
            current_group = [format]
            current_format = format
    groups.append(current_group)
    return groups

def find_max_list(list):
    list_len = [len(i) for i in list]
    return len(list) - 1 - list_len[::-1].index(max(list_len))

def find_good_key_in_dict(dict: dict, value) -> str:
    for key in dict.keys():
        if value in dict[key]:
            return key
    return None

def create_dict_and_assign_styles_from_format(formats: list) -> dict:
    # create a dictionary with the format as key and the style as value
    styles = {}
    content_format_index = find_max_list(formats)
    i = 0
    for l in formats[:content_format_index]:
        formats[content_format_index - i] += l
        del formats[formats.index(l)]
        i += 1
    number_of_styles = len(formats)
    styles["content"] = sorted(list(set(formats[0])))
    for i in range(1, len(formats)):
        styles["title" + str(number_of_styles - i)] = sorted(list(set(formats[i])))
    return styles

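flatten and group_formats are the core of the font-size clustering used by the PDF readers: sizes within 0.2 of the first member of a group are treated as one style. A sketch (not part of the commit):

from src.tools.reader_pdf_tools import flatten, group_formats

print(flatten([1, [2, [3]], 4]))  # [1, 2, 3, 4]
print(group_formats([9.0, 9.1, 9.2, 12.0, 12.1, 18.0]))
# [[9.0, 9.1, 9.2], [12.0, 12.1], [18.0]]
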
src/tools/reader_word.py
ADDED
@@ -0,0 +1,106 @@
import docx
import os
# sys.path.append('path to app')
# import docx
# import os
# import sys

from src.model.paragraph import Paragraph

class WordReader:

    def __init__(self, path):
        self.path = path
        self.paragraphs = self.get_word_paragraphs()

    def get_word_paragraphs(self):
        """
        Fetches paragraphs from a Word document.

        Returns:
            list: List of Paragraph objects from the document.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(f"The file {self.path} does not exist.")

        try:
            doc = docx.Document(self.path)
            paragraphs = self.to_paragraph_objects(doc.paragraphs)  # Convert to Paragraph objects
            return paragraphs
        except Exception as e:
            raise ValueError(f"Error reading the .docx file. Original error: {str(e)}")

    def determine_style(self, paragraph):
        """
        Determines the style of the paragraph based on its attributes.

        Returns:
            str: Style of the paragraph.
        """
        # Check for heading styles first
        if paragraph.style.name.startswith('Heading 1'):
            return "title1"
        elif paragraph.style.name.startswith('Heading 2'):
            return "title2"
        elif paragraph.style.name.startswith('Heading 3'):
            return "title3"
        elif paragraph.style.name.startswith('Heading 4'):
            return "title4"
        elif paragraph.style.name.startswith('Heading 5'):
            return "title5"

        # If not a heading, check the runs within the paragraph
        for run in paragraph.runs:
            font = run.font
            fontname = font.name
            size = font.size

            # Convert the stored length to points
            if size:
                size_in_points = size.pt

                # Map based on font name and size as in the PDF reader
                if fontname == "XFQKGD+Consolas":
                    return "code"
                elif (size_in_points >= 9 and size_in_points < 11.5) or fontname == "Wingdings-Regular":
                    return "content"
        # If none of the above conditions match, default to 'content'
        return "content"

    def to_paragraph_objects(self, doc_paragraphs):
        """
        Convert docx paragraphs to Paragraph objects for further processing.
        """
        paragraph_objects = []
        for idx, paragraph in enumerate(doc_paragraphs):
            style = self.determine_style(paragraph)

            # Assuming page_id is always 1 for simplicity, change as needed.
            p_obj = Paragraph(text=paragraph.text, font_style=style, id_=idx, page_id=1)
            paragraph_objects.append(p_obj)
        paragraphs = self.rearrange_paragraphs(paragraph_objects)

        return paragraphs

    def rearrange_paragraphs(self, paragraphs: [Paragraph]):
        # associate paragraphs with the same font style
        i = 0
        while i < len(paragraphs):
            paragraphs[i] = paragraphs[i].rearrange_paragraph()
            i += 1
        return paragraphs

    def display_paragraphs(self):
        """
        Prints the paragraphs from the document to the console.
        """
        for paragraph in self.paragraphs:
            print(paragraph.text)
            print('-' * 40)  # separator for clarity

# if __name__ == '__main__':
#     reader = WordReader("Illumio_Core_REST_API_Developer_Guide_23.3.docx")
#     reader.display_paragraphs()

src/tools/readers_pdf.py
ADDED
@@ -0,0 +1,428 @@
import json
import PyPDF2
# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
# To extract the images from the PDFs
from PIL import Image
from pdf2image import convert_from_path
# To perform OCR and extract text from images
import pytesseract
# To remove the additional files created along the way
import os
import pdfplumber as pdfp

from src.model.paragraph import Paragraph
from src.tools.table_converter import table_converter
from src.tools.reader_pdf_tools import *
import gradio as gr


def get_style_of_line(size: float, fontname: str):
    if fontname == "XFQKGD+Consolas":
        return "code"
    elif (size >= 9 and size < 11.5) or fontname == "CRRYJU+Wingdings-Regular":
        return "content"
    elif size >= 11.5 and size <= 12.7:
        return "title5"
    elif size >= 12.8 and size <= 13.5:
        return "title4"
    elif size > 13.5 and size <= 15.5:
        return "title3"
    elif size > 15.5 and size <= 18.5:
        return "title2"
    elif size > 19 and size < 30:
        return "title1"
    else:
        return "unknown"


class Reader:
    def __init__(self, path, actual_first_page_=0, include_images=True):
        self.path = path
        self.paragraphs = self.pdf_manager(path, actual_first_page_, include_images=include_images)

    def most_occuring_fonts(self, line_formats: list):
        if line_formats != []:
            min_freq = 3
            font_size_freq = {i: line_formats.count(i) for i in set(line_formats) if isinstance(i, float)}
            most_occuring_font_sizes = [size for size, freq in font_size_freq.items() if freq >= min_freq]
            line_formats = [i for i in line_formats if i in most_occuring_font_sizes or isinstance(i, str)]
        return line_formats

    def text_extraction(self, element):
        # Extract the text from the in-line text element
        line_text = element.get_text()
        # Find the formats of the text:
        # initialize the list with all the formats that appear in the line of text
        line_formats = []
        for text_line in element:
            if isinstance(text_line, LTTextContainer):
                # Iterate through each character in the line of text
                for character in text_line:
                    if isinstance(character, LTChar):
                        # Append the font name of the character
                        line_formats.append(character.fontname)
                        # Append the font size of the character
                        line_formats.append(character.size)
        # Find the most occurring font sizes and keep them; if there are several, keep all of them
        line_formats = self.most_occuring_fonts(line_formats)
        # Find the unique font sizes and names in the line and delete the None values
        format_per_line = list(set(line_formats))
        # Return a tuple with the text of the line along with its formats
        return (line_text, format_per_line)

    # Extract a table from the page
    def extract_table(self, pdf_path, page_num, table_num):
        # Open the pdf file
        pdf = pdfplumber.open(pdf_path)
        # Find the examined page
        table_page = pdf.pages[page_num]
        # Extract the appropriate table
        table = table_page.extract_tables()[table_num]
        return table

    # Check whether the element lies inside any of the tables present on the page
    def is_element_inside_any_table(self, element, page, tables):
        x0, y0up, x1, y1up = element.bbox
        # Convert the coordinates because pdfminer counts from the bottom to the top of the page
        y0 = page.bbox[3] - y1up
        y1 = page.bbox[3] - y0up
        for table in tables:
            tx0, ty0, tx1, ty1 = table.bbox
            if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
                return True
        return False

    # Find the table a given element belongs to
    def find_table_for_element(self, element, page, tables):
        x0, y0up, x1, y1up = element.bbox
        # Convert the coordinates because pdfminer counts from the bottom to the top of the page
        y0 = page.bbox[3] - y1up
        y1 = page.bbox[3] - y0up
        for i, table in enumerate(tables):
            tx0, ty0, tx1, ty1 = table.bbox
            if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
                return i  # Return the index of the table
        return None

    # Crop image elements from the PDF
    def crop_image(self, element, pageObj):
        # Get the coordinates to crop the image from the PDF
        [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]
        # Crop the page using coordinates (left, bottom, right, top)
        pageObj.mediabox.lower_left = (image_left, image_bottom)
        pageObj.mediabox.upper_right = (image_right, image_top)
        # Save the cropped page to a new PDF
        cropped_pdf_writer = PyPDF2.PdfWriter()
        cropped_pdf_writer.add_page(pageObj)
        # Save the cropped PDF to a new file
        with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
            cropped_pdf_writer.write(cropped_pdf_file)

    # Convert the (cropped) PDF to an image
    def convert_to_images(self, input_file):
        images = convert_from_path(input_file)
        image = images[0]
        output_file = 'PDF_image.png'
        image.save(output_file, 'PNG')

    # Read text from an image with OCR
    def image_to_text(self, image_path):
        # Read the image
        img = Image.open(image_path)
        # Extract the text from the image
        text = pytesseract.image_to_string(img)
        return text

    def pdf_manager(self, pdf_path, actual_first_page=0, include_images=True):
        # Create a PDF file object
        pdfFileObj = open(pdf_path, 'rb')
        # Create a PDF reader object
        pdfReaded = PyPDF2.PdfReader(pdfFileObj)
        number_of_pages = len(pdfReaded.pages)
        # Create the dictionary that will hold the text extracted from each page
        text_per_page = {}
        # Create a boolean variable for image detection
        image_flag = False
        actual_first_page = int(actual_first_page)
        if actual_first_page > number_of_pages:
            gr.Warning("The number of pages you want to skip is greater than the number of pages in the document. We will extract all the pages.")
            page_numbers = None
        else:
            page_numbers = [i for i in range(actual_first_page - 1, number_of_pages)]
        # Extract the pages from the PDF
        for pagenum, page in enumerate(extract_pages(pdf_path, page_numbers=page_numbers)):
            # Initialize the page object
            pagenum = page_numbers[pagenum] if page_numbers else pagenum
            pageObj = pdfReaded.pages[pagenum]
            # Initialize the variables needed for the text extraction from the page
            page_text = []
            line_format = []
            text_from_images = []
            text_from_tables = []
            page_content = []
            # Initialize the number of the examined tables
            table_in_page = -1
            # Open the pdf file
            pdf = pdfplumber.open(pdf_path)
            # Find the examined page
            page_tables = pdf.pages[pagenum]
            # Find the tables on the page
            tables = page_tables.find_tables()
            if len(tables) != 0:
                table_in_page = 0

            # Extract the tables of the page
            for table_num in range(len(tables)):
                # Extract the information of the table
                table = self.extract_table(pdf_path, pagenum, table_num)
                # Convert the table information into a structured string format
                table_string = table_converter(table)
                # Append the table string to a list
                text_from_tables.append(table_string)

            # Find all the elements
            page_elements = [(element.y1, element) for element in page._objs]
            # Sort all the elements as they appear in the page
            page_elements.sort(key=lambda a: a[0], reverse=True)

            # Walk through the elements that compose the page
            for i, component in enumerate(page_elements):
                # Extract the element of the page layout
                element = component[1]

                # Check the elements for tables
                if table_in_page == -1:
                    pass
                else:
                    if self.is_element_inside_any_table(element, page, tables):
                        table_found = self.find_table_for_element(element, page, tables)
                        if table_found == table_in_page and table_found != None:
                            page_content.append(text_from_tables[table_in_page])
                            page_text.append('table')
                            line_format.append('table')
                            table_in_page += 1
                        # Skip this iteration because the content of this element was already extracted from the tables
                        continue

                if not self.is_element_inside_any_table(element, page, tables):
                    # Check if the element is a text element
                    if isinstance(element, LTTextContainer):
                        # Extract the text and format for each text element
                        (line_text, format_per_line) = self.text_extraction(element)
                        # Append the text of each line to the page text
                        page_text.append(line_text)
                        # Append the format of each line containing text
                        line_format.append(format_per_line)
                        page_content.append(line_text)

                    # Check the elements for images
                    if include_images:
                        if isinstance(element, LTFigure):
                            # Crop the image from the PDF
                            self.crop_image(element, pageObj)
                            # Convert the cropped pdf to an image
                            self.convert_to_images('cropped_image.pdf')
                            # Extract the text from the image
                            image_text = self.image_to_text('PDF_image.png')
                            text_from_images.append(image_text)
                            page_content.append(image_text)
                            # Add a placeholder in the text and format lists
                            page_text.append('image')
                            line_format.append('image')
                            # Update the flag for image detection
                            image_flag = True

            # Create the key of the dictionary
            dctkey = 'Page_' + str(pagenum)
            # Add the list of lists as the value of the page key
            text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]

        # Close the pdf file object
        pdfFileObj.close()

        # Create a list of formats for all the pages
        formats = []
        for p in text_per_page.values():
            formats.append(p[1])

        # Flatten the list of lists
        formats = flatten(formats)

        # Keep only the font sizes in the list
        formats = keep_int_and_floats_in_list(formats)

        # Group the formats into lists of similar formats
        grouped_formats = group_formats(formats)

        # Create a dictionary with the format as key and the style as value
        styles = create_dict_and_assign_styles_from_format(grouped_formats)

        # Dump the result to a separate file as JSON with some indentation for better visualization
        doc_name = pdf_path.split('/')[-1] if '/' in pdf_path else pdf_path.split('\\')[-1]
        with open(file="styles.txt", mode='a') as fp:
            if fp.tell() == 0:
                fp.write('Document title: ' + doc_name + '\n')
            else:
                fp.write('\nDocument title: ' + doc_name + '\n')
            json.dump(styles, fp, indent=4)

        # Delete the additional files created if an image was detected
        if image_flag:
            os.remove('cropped_image.pdf')
            os.remove('PDF_image.png')

        # Beginning of the paragraph extraction
        paragraphs = []
        for index, page in enumerate(text_per_page.values()):
            content_format = page[1]
            j = 0
            while j + 1 < len(content_format):
                actual_format = content_format[j]
                n_of_fontsizes = len(list(i for i in actual_format if isinstance(i, int) or isinstance(i, float)))
                if n_of_fontsizes > 1:
                    actual_format = max(keep_int_and_floats_in_list(actual_format))
                    actual_format = find_good_key_in_dict(styles, actual_format)
                elif n_of_fontsizes == 1:
                    actual_format = keep_int_and_floats_in_list(actual_format)[0]
                    actual_format = find_good_key_in_dict(styles, actual_format)
                elif n_of_fontsizes == 0 and actual_format == "table":
                    actual_format = "table"
                else:
                    actual_format = "content"
                # Try to find the right format if the current result seems wrong
                # (this heuristic changes depending on the document)
                if len(page[4][j]) > 150 and "title" in actual_format:
                    actual_format = "content"
                paragraph = Paragraph(text=page[4][j], font_style=actual_format, id_=j, page_id=index)
                paragraphs.append(paragraph)
                j += 1

        paragraphs = self.concatenate_paragraphs(paragraphs, doc_name)
        return paragraphs

    def concatenate_paragraphs(self, paragraphs, doc_title):
        concatenated_paragraphs = []
        i = 0
        actual_page_id = paragraphs[0].page_id
        while i < len(paragraphs):
            p = paragraphs[i]
            if p.blank or "REST API Developer Guide 23.3" in p.text or "x! illumio" in p.text:
                i += 1
                continue
            if (p.page_id != actual_page_id) and doc_title == "Illumio_Core_REST_API_Developer_Guide_23.3.pdf" and (not p.font_style == "table" and not "title" in p.font_style):
                i += 2
                actual_page_id = p.page_id
                continue
            if not concatenated_paragraphs:
                concatenated_paragraphs.append(p)
            elif p.font_style != concatenated_paragraphs[-1].font_style:
                if (p.font_style == "table" and concatenated_paragraphs[-1].font_style == "content") \
                        or (p.font_style == "content" and concatenated_paragraphs[-1].font_style == "table"):
                    concatenated_paragraphs[-1].text += '\n' + p.text
                else:
                    concatenated_paragraphs.append(p)
            else:
                if "title" in p.font_style:
                    concatenated_paragraphs[-1].text += ' : ' + p.text
                    concatenated_paragraphs[-1].text = concatenated_paragraphs[-1].text.replace('\n', '').replace('\r', '')
                else:
                    concatenated_paragraphs[-1].text += '\n' + p.text
            i += 1
        return concatenated_paragraphs


class Reader_illumio:
    def __init__(self, path):
        self.path = path
        self.paragraphs = self.get_pdf_paragraphs(path)

    def skip_header(self, dictionary):
        i = 0
        if "Illumio_Core_REST_API_Developer_Guide_23.3" in self.path and not (dictionary[i]["chars"][0]["size"] > 19 and dictionary[i]["chars"][0]["size"] < 30):
            i += 2
        return i

    def get_pdf_paragraphs(self, path):
        pdf_to_read = self.extract_all_lines_from_the_doc(path)
        paragraphs = []
        j = 0
        while j < len(pdf_to_read):
            dictionary = pdf_to_read[j]["content"]
            tables = pdf_to_read[j]["tables"]
            i = self.skip_header(dictionary)
            table_count = 0
            while i < len(dictionary):
                if dictionary[i]["text"].startswith("RESTAPIDeveloperGuide"):
                    i += 1
                    continue
                if self.check_if_already_in_table(dictionary[i]['chars'][0], tables) == False:
                    p = Paragraph(dictionary[i]["text"], font_style=get_style_of_line(dictionary[i]["chars"][0]["size"], dictionary[i]["chars"][0]["fontname"]), id_=i, page_id=pdf_to_read[j]["page_number"])
                    if i != len(dictionary) - 1:
                        # Merge consecutive lines that share size and font name;
                        # the i + 1 bound keeps the look-ahead from running past the page
                        while i + 1 < len(dictionary) and (dictionary[i+1]["chars"][0]["size"] == dictionary[i]["chars"][-1]["size"] and dictionary[i+1]["chars"][0]["fontname"] == dictionary[i]["chars"][-1]["fontname"]) and self.check_if_already_in_table(dictionary[i+1]['chars'][0], tables) == False:
                            p.text += " " + dictionary[i+1]["text"]
                            i += 1
                    else:
                        p.text = dictionary[i]["text"]
                    i += 1
                    paragraphs.append(p)
                else:
                    p = Paragraph(table_converter(tables[table_count].extract()), font_style="table", id_=i, page_id=pdf_to_read[j]["page_number"])
                    paragraphs.append(p)
                    i = self.skip_out_table(dictionary, i, tables[table_count])
                    table_count += 1
            j += 1
        paragraphs = self.rearrange_paragraphs(paragraphs)
        return paragraphs

    def rearrange_paragraphs(self, paragraphs: list[Paragraph]):
        # Associate paragraphs that share the same font style
        i = 0
        while i < len(paragraphs):
            paragraphs[i] = paragraphs[i].rearrange_paragraph()
            i += 1
        return paragraphs

    def extract_all_lines_from_the_doc(self, path):
        lines_of_doc = []
        with open(path, 'rb') as f:
            reader = pdfp.PDF(f)
            if "Illumio_Core_REST_API_Developer_Guide_23.3" in path:
                # Skip the table of contents (the first 8 pages)
                skip_table_of_contents = reader.pages[8:]
                j = 0
                while j < len(skip_table_of_contents):
                    lines_of_doc.append({"page_number": j + 9, "content": skip_table_of_contents[j].extract_text_lines(), "tables": skip_table_of_contents[j].find_tables()})
                    j += 1
            else:
                for page in reader.pages:
                    lines_of_doc.append({"page_number": page.page_number, "content": page.extract_text_lines(), "tables": page.find_tables()})
        return lines_of_doc

    def check_if_already_in_table(self, line, tables):
        for table in tables:
            if table.bbox[1] <= line["top"] <= table.bbox[3]:
                return True
        return False

    def skip_out_table(self, dictionary, index, table):
        i = index
        while i < len(dictionary):
            if self.check_if_already_in_table(dictionary[i]['chars'][0], tables=[table]) == True:
                i += 1
            else:
                break
        return i
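A minimal usage sketch for the reader above (the PDF file name is hypothetical; note that pdf_manager also appends the detected style map to styles.txt as a side effect):

# Minimal sketch, assuming a local PDF named "example.pdf" (hypothetical file)
from src.tools.readers_pdf import Reader

reader = Reader("example.pdf", actual_first_page_=1, include_images=False)
for p in reader.paragraphs[:5]:
    print(p.font_style, ':', p.text[:80])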
src/tools/retriever.py
ADDED
@@ -0,0 +1,49 @@
from src.model.block import Block
from src.model.doc import Doc
from src.tools.llm import LlmAgent
import gradio as gr


class Retriever:
    def __init__(self, doc: Doc = None, collection=None, llmagent: LlmAgent = None):
        if doc != None:
            blocks_good_format: list[Block] = doc.blocks
            self.collection = collection
            gr.Info("Please wait while the database is being created")
            for block in blocks_good_format:
                if len(block.content) > 4500:
                    # Blocks that are too long are split before being summarized and indexed
                    new_blocks = block.separate_1_block_in_n(max_size=4500)
                    for new_block in new_blocks:
                        summary = llmagent.summarize_paragraph_v2(prompt=new_block.content, title_doc=doc.title, title_para=block.title)
                        if "<summary>" in summary:
                            summary = summary.split("<summary>")[1]
                        self.collection.add(
                            documents=[summary],
                            ids=[new_block.index],
                            metadatas=[new_block.to_dict()]
                        )
                else:
                    summary = llmagent.summarize_paragraph_v2(prompt=block.content, title_doc=doc.title, title_para=block.title)
                    if "<summary>" in summary:
                        summary = summary.split("<summary>")[1]
                    self.collection.add(
                        documents=[summary],
                        ids=[block.index],
                        metadatas=[block.to_dict()]
                    )
            gr.Info(f"The collection {collection.name} has been added to the database")
        else:
            self.collection = collection

    def similarity_search(self, queries: str) -> list:
        res = self.collection.query(query_texts=queries, n_results=6)
        block_dict_sources = res['metadatas'][0]
        distances = res['distances'][0]
        blocks = []
        for bd, d in zip(block_dict_sources, distances):
            b = Block().from_dict(bd)
            b.distance = d
            blocks.append(b)
        return blocks
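A minimal query sketch for the retriever above, assuming an already-populated chromadb collection (the client setup and collection name are hypothetical; passing doc=None skips indexing and only wraps the collection):

import chromadb
from src.tools.retriever import Retriever

client = chromadb.Client()  # hypothetical in-memory client
collection = client.get_or_create_collection("demo")  # hypothetical name, assumed already populated
retriever = Retriever(doc=None, collection=collection)
blocks = retriever.similarity_search("How do I authenticate?")
for b in blocks:
    print(b.index, b.distance)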
src/tools/table_converter.py
ADDED
@@ -0,0 +1,14 @@
# Convert a table into an appropriate string format

def table_converter(table):
    table_string = ''
    # Iterate through each row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Remove the line breaks from the wrapped texts and replace None cells with 'None'
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Convert the row into a pipe-delimited string
        table_string += ('|' + '|'.join(cleaned_row) + '|' + '\n')
    # Remove the last line break
    table_string = table_string[:-1]
    return table_string
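For illustration, a tiny hypothetical table in the nested-list form pdfplumber returns, and the pipe-delimited string the converter produces (embedded line breaks become spaces, None cells become the literal string 'None'):

from src.tools.table_converter import table_converter

table = [["Header A", "Header B"], ["value\nwrapped", None]]  # hypothetical 2x2 table
print(table_converter(table))
# |Header A|Header B|
# |value wrapped|None|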
src/view/__pycache__/view.cpython-311.pyc
ADDED
Binary file (17.5 kB).
src/view/view.py
ADDED
@@ -0,0 +1,262 @@
import gradio as gr
from src.control.control import Chatbot
from chromadb.utils import embedding_functions
import os

def run(ctrl: Chatbot, config: dict):
    with gr.Blocks() as qna:
        with gr.Row():
            with gr.Column():
                pass

            with gr.Column(scale=10):
                gr.Markdown(config['title'])
                page_start_warning = gr.Markdown("<center>⚠️ If your document starts with a front cover and/or a table of contents, please enter the page number of the first page with real content. ⚠️</center>")
                actual_page_start = gr.Number(
                    label="Start page (default = 1)",
                    visible=True,
                    interactive=True,
                    container=True,
                    value=1,
                )

                include_images_btn = gr.Checkbox(
                    label="Analyse text from images. This option is definitely slower, particularly on big documents. (ONLY for .pdf)",
                    value=False,
                    visible=True,
                    container=True,
                )

                input_doc_comp = gr.File(
                    label="Upload a file",
                    scale=1,
                    min_width=100,
                )

                histo_text_comp = gr.Chatbot(
                    visible=False,
                    value=[],
                )
                input_text_comp = gr.Textbox(
                    label="",
                    lines=1,
                    visible=False,
                    max_lines=3,
                    interactive=True,
                    placeholder="Ask your question here",
                )

                clear_btn = gr.Button("Clear Chat", visible=False)

                input_example_comp = gr.Radio(
                    label="Examples",
                    choices=config['examples'].values(),
                    value="",
                    visible=False,
                )

                source_text_comp = []
                for i in range(4):
                    source_text_comp.append(gr.Textbox(
                        lines=4,
                        max_lines=4,
                        interactive=False,
                        visible=False,
                    ))
                upload_another_doc_btn = gr.Button("Upload another document", visible=False)

                open_ai_embedding = embedding_functions.OpenAIEmbeddingFunction(api_key=os.environ['OPENAI_API_KEY'], model_name="text-embedding-ada-002")

            with gr.Column(scale=7):
                collections_list = gr.Radio(choices=[a.name for a in ctrl.client_db.list_collections()],
                                            label="Current collections in the database",
                                            visible=True,
                                            info="Choose a collection to query."
                                            )
                delete_database_btn = gr.Button("Delete current collection", visible=False)

        def input_doc_fn(input_doc_, include_images_, actual_page_start_):
            result = ctrl.upload_doc(input_doc_, include_images_, actual_page_start_)
            if result == True:
                return {
                    input_doc_comp: gr.update(visible=False),
                    input_text_comp: gr.update(visible=True),
                    input_example_comp: gr.update(visible=True),
                    clear_btn: gr.update(visible=True),
                    include_images_btn: gr.update(visible=False, value=include_images_),
                    delete_database_btn: gr.update(visible=True),
                    upload_another_doc_btn: gr.update(visible=True),
                    collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()], value=ctrl.retriever.collection.name),
                    page_start_warning: gr.update(visible=False),
                    actual_page_start: gr.update(visible=False),
                }
            else:
                gr.Warning("File extension not supported. Only .docx, .pdf and .html are supported.")
                return {
                    input_doc_comp: gr.update(visible=True),
                    input_text_comp: gr.update(visible=False),
                    input_example_comp: gr.update(visible=False),
                    clear_btn: gr.update(visible=False),
                    include_images_btn: gr.update(visible=True, value=include_images_),
                    page_start_warning: gr.update(visible=True),
                    actual_page_start: gr.update(visible=True, value=1),
                }

        def input_file_clear():
            update_ = {
                input_doc_comp: gr.update(visible=True, value=None),
                clear_btn: gr.update(visible=False),
                input_text_comp: gr.update(value='', visible=False),
                histo_text_comp: gr.update(value='', visible=False),
                input_example_comp: gr.update(value='', visible=False),
                include_images_btn: gr.update(visible=True),
                upload_another_doc_btn: gr.update(visible=False),
                delete_database_btn: gr.update(visible=True),
                page_start_warning: gr.update(visible=True),
                actual_page_start: gr.update(visible=True, value=1),
                collections_list: gr.update(value=None, choices=[a.name for a in ctrl.client_db.list_collections()]),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False, value='hello')
            return update_

        def input_text_fn1(input_text_, histo_text_):
            histo_text_.append((input_text_, None))
            update_ = {
                histo_text_comp: gr.update(visible=True, value=histo_text_),
                input_example_comp: gr.update(visible=False),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False)
            return update_

        def input_text_fn2(input_text_, histo_text_):
            answer, sources = ctrl.get_response(query=input_text_, histo=histo_text_)
            histo_text_[-1] = (input_text_, answer)
            update_ = {
                histo_text_comp: gr.update(value=histo_text_),
                input_text_comp: gr.update(value=''),
            }
            for i in range(min(len(sources), 3)):
                s = sources[i]
                if i != 0:
                    prev = sources[i - 1]
                    if prev.index == s.index:
                        continue
                source_label = f'{s.index} {s.title} score = {s.distance_str}'
                source_text = s.content
                update_[source_text_comp[i]] = gr.update(visible=True, value=source_text, label=source_label)
            return update_

        def input_example_fn(input_example_, histo_text_):
            histo_text_.append((input_example_, None))
            update_ = {
                input_text_comp: gr.update(value=input_example_),
                histo_text_comp: gr.update(visible=True, value=histo_text_),
                input_example_comp: gr.update(visible=False, value=''),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False)
            return update_

        def clear_fn():
            update_ = {
                input_text_comp: gr.update(value=''),
                histo_text_comp: gr.update(value='', visible=False),
                input_example_comp: gr.update(value='', visible=True),
                upload_another_doc_btn: gr.update(visible=True),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False, value='hello')
            return update_

        def list_all_chroma_collections():
            update = {
                collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()]),
            }
            return update

        def change_collection(collection_name):
            ctrl.retriever.collection = ctrl.client_db.get_collection(collection_name, embedding_function=open_ai_embedding)
            return {
                delete_database_btn: gr.update(visible=True),
                input_doc_comp: gr.update(visible=False, value=None),
                input_text_comp: gr.update(visible=True, value=''),
                input_example_comp: gr.update(visible=True),
                clear_btn: gr.update(visible=True),
                collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()]),
                include_images_btn: gr.update(visible=False),
                histo_text_comp: gr.update(visible=False, value=''),
                upload_another_doc_btn: gr.update(visible=True),
                actual_page_start: gr.update(visible=False),
                page_start_warning: gr.update(visible=False),
            }

        def delete_curr_database():
            ctrl.client_db.delete_collection(ctrl.retriever.collection.name)
            gr.Info(f"Collection {ctrl.retriever.collection.name} deleted from the database")
            return {
                delete_database_btn: gr.update(visible=False),
                input_doc_comp: gr.update(visible=True, value=None),
                input_text_comp: gr.update(visible=False, value=''),
                input_example_comp: gr.update(visible=False),
                clear_btn: gr.update(visible=False),
                collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()]),
                include_images_btn: gr.update(visible=True),
                histo_text_comp: gr.update(visible=False, value=''),
                upload_another_doc_btn: gr.update(visible=False),
                actual_page_start: gr.update(visible=True, value=1),
                page_start_warning: gr.update(visible=True),
            }

        upload_another_doc_btn.click(input_file_clear,
                        inputs=None,
                        outputs=[collections_list, page_start_warning, actual_page_start, input_doc_comp, input_text_comp, input_example_comp, clear_btn, include_images_btn, histo_text_comp, delete_database_btn, upload_another_doc_btn, source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])

        delete_database_btn.click(delete_curr_database,
                        inputs=None,
                        outputs=[page_start_warning, actual_page_start, delete_database_btn, input_doc_comp, input_text_comp, input_example_comp, clear_btn, collections_list, include_images_btn, histo_text_comp, upload_another_doc_btn])

        collections_list.input(change_collection,
                        inputs=[collections_list],
                        outputs=[actual_page_start, page_start_warning, collections_list, input_text_comp, input_example_comp, clear_btn, include_images_btn, histo_text_comp, input_doc_comp, delete_database_btn, upload_another_doc_btn])

        input_doc_comp \
            .upload(input_doc_fn,
                    inputs=[input_doc_comp, include_images_btn, actual_page_start],
                    outputs=[page_start_warning, actual_page_start, input_doc_comp, input_text_comp, upload_another_doc_btn,
                             input_example_comp, include_images_btn, clear_btn, histo_text_comp, delete_database_btn, collections_list, source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(list_all_chroma_collections,
                  inputs=None,
                  outputs=[collections_list])

        input_doc_comp \
            .clear(input_file_clear,
                   inputs=None,
                   outputs=[page_start_warning, actual_page_start, input_doc_comp, clear_btn, upload_another_doc_btn, input_text_comp, histo_text_comp, input_example_comp, include_images_btn, delete_database_btn,
                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])

        input_text_comp \
            .submit(input_text_fn1,
                    inputs=[input_text_comp, histo_text_comp],
                    outputs=[histo_text_comp, input_example_comp,
                             source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(input_text_fn2,
                  inputs=[input_text_comp, histo_text_comp],
                  outputs=[input_text_comp, histo_text_comp,
                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
        input_example_comp \
            .input(input_example_fn,
                   inputs=[input_example_comp, histo_text_comp],
                   outputs=[input_text_comp, histo_text_comp, input_example_comp,
                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(input_text_fn2,
                  inputs=[input_text_comp, histo_text_comp],
                  outputs=[input_text_comp, histo_text_comp,
                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
        clear_btn.click(clear_fn,
                        inputs=None,
                        outputs=[input_text_comp, histo_text_comp, input_example_comp, upload_another_doc_btn,
                                 source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])

    return qna
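A schematic launch sketch for the view above: run wires the components to a Chatbot controller and returns the gr.Blocks app, which the caller launches. The Chatbot construction is not shown in this excerpt, so ctrl is assumed to be built as in app.py, and the config values are hypothetical:

from src.view import view

# ctrl: a Chatbot instance assumed to be built elsewhere (see app.py)
qna = view.run(ctrl, config={'title': '# Chat with your document', 'examples': {'ex1': 'What is this document about?'}})
qna.launch()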
styles.txt
ADDED
@@ -0,0 +1,18 @@
Document title: Defoe_RobinsonCrusoe1.pdf
{
    "content": [
        11.0,
        13.300000000000011,
        15.999999999999943,
        15.999999999999986,
        16.0,
        16.000000000000007,
        16.00000000000003
    ],
    "title2": [
        23.0
    ],
    "title1": [
        27.600000000000023
    ]
}