Jonathan Wang committed
Commit 89cbc4d · 0 Parent(s)

initial commit

Files changed (26)
  1. .gitattributes +7 -0
  2. .gitignore +171 -0
  3. .streamlit/config.toml +2 -0
  4. .vscode/launch.json +3 -0
  5. LICENSE +661 -0
  6. README.md +53 -0
  7. agent.py +92 -0
  8. app.py +471 -0
  9. citation.py +245 -0
  10. engine.py +126 -0
  11. full_doc.py +336 -0
  12. keywords.py +110 -0
  13. merger.py +174 -0
  14. metadata_adder.py +280 -0
  15. models.py +785 -0
  16. obs_logging.py +380 -0
  17. packages.txt +4 -0
  18. parsers.py +106 -0
  19. pdf_reader.py +528 -0
  20. pdf_reader_utils.py +592 -0
  21. prompts.py +86 -0
  22. pyproject.toml +53 -0
  23. requirements.txt +34 -0
  24. retriever.py +280 -0
  25. storage.py +120 -0
  26. summary.py +246 -0
.gitattributes ADDED
@@ -0,0 +1,7 @@
+ nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle filter=lfs diff=lfs merge=lfs -text
+ nltk_data/tokenizers/punkt/english.pickle filter=lfs diff=lfs merge=lfs -text
+ nltk_data/tokenizers/punkt/PY3/english.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.tab filter=lfs diff=lfs merge=lfs -text
+ *.json filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,171 @@
+ ##### LOCAL PROJECT FILES #####
+ data/
+ refs/
+ figures/
+ config.py
+ .streamlit/secrets.toml
+
+ ###############################
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
+ [browser]
+ gatherUsageStats = false
.vscode/launch.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f435c38bfb7c91633a094d3ca2f8224839fb2151158536bda1ca0de4b395b426
+ size 624
LICENSE ADDED
@@ -0,0 +1,661 @@
1
+ GNU AFFERO GENERAL PUBLIC LICENSE
2
+ Version 3, 19 November 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU Affero General Public License is a free, copyleft license for
11
+ software and other kinds of works, specifically designed to ensure
12
+ cooperation with the community in the case of network server software.
13
+
14
+ The licenses for most software and other practical works are designed
15
+ to take away your freedom to share and change the works. By contrast,
16
+ our General Public Licenses are intended to guarantee your freedom to
17
+ share and change all versions of a program--to make sure it remains free
18
+ software for all its users.
19
+
20
+ When we speak of free software, we are referring to freedom, not
21
+ price. Our General Public Licenses are designed to make sure that you
22
+ have the freedom to distribute copies of free software (and charge for
23
+ them if you wish), that you receive source code or can get it if you
24
+ want it, that you can change the software or use pieces of it in new
25
+ free programs, and that you know you can do these things.
26
+
27
+ Developers that use our General Public Licenses protect your rights
28
+ with two steps: (1) assert copyright on the software, and (2) offer
29
+ you this License which gives you legal permission to copy, distribute
30
+ and/or modify the software.
31
+
32
+ A secondary benefit of defending all users' freedom is that
33
+ improvements made in alternate versions of the program, if they
34
+ receive widespread use, become available for other developers to
35
+ incorporate. Many developers of free software are heartened and
36
+ encouraged by the resulting cooperation. However, in the case of
37
+ software used on network servers, this result may fail to come about.
38
+ The GNU General Public License permits making a modified version and
39
+ letting the public access it on a server without ever releasing its
40
+ source code to the public.
41
+
42
+ The GNU Affero General Public License is designed specifically to
43
+ ensure that, in such cases, the modified source code becomes available
44
+ to the community. It requires the operator of a network server to
45
+ provide the source code of the modified version running there to the
46
+ users of that server. Therefore, public use of a modified version, on
47
+ a publicly accessible server, gives the public access to the source
48
+ code of the modified version.
49
+
50
+ An older license, called the Affero General Public License and
51
+ published by Affero, was designed to accomplish similar goals. This is
52
+ a different license, not a version of the Affero GPL, but Affero has
53
+ released a new version of the Affero GPL which permits relicensing under
54
+ this license.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ TERMS AND CONDITIONS
60
+
61
+ 0. Definitions.
62
+
63
+ "This License" refers to version 3 of the GNU Affero General Public License.
64
+
65
+ "Copyright" also means copyright-like laws that apply to other kinds of
66
+ works, such as semiconductor masks.
67
+
68
+ "The Program" refers to any copyrightable work licensed under this
69
+ License. Each licensee is addressed as "you". "Licensees" and
70
+ "recipients" may be individuals or organizations.
71
+
72
+ To "modify" a work means to copy from or adapt all or part of the work
73
+ in a fashion requiring copyright permission, other than the making of an
74
+ exact copy. The resulting work is called a "modified version" of the
75
+ earlier work or a work "based on" the earlier work.
76
+
77
+ A "covered work" means either the unmodified Program or a work based
78
+ on the Program.
79
+
80
+ To "propagate" a work means to do anything with it that, without
81
+ permission, would make you directly or secondarily liable for
82
+ infringement under applicable copyright law, except executing it on a
83
+ computer or modifying a private copy. Propagation includes copying,
84
+ distribution (with or without modification), making available to the
85
+ public, and in some countries other activities as well.
86
+
87
+ To "convey" a work means any kind of propagation that enables other
88
+ parties to make or receive copies. Mere interaction with a user through
89
+ a computer network, with no transfer of a copy, is not conveying.
90
+
91
+ An interactive user interface displays "Appropriate Legal Notices"
92
+ to the extent that it includes a convenient and prominently visible
93
+ feature that (1) displays an appropriate copyright notice, and (2)
94
+ tells the user that there is no warranty for the work (except to the
95
+ extent that warranties are provided), that licensees may convey the
96
+ work under this License, and how to view a copy of this License. If
97
+ the interface presents a list of user commands or options, such as a
98
+ menu, a prominent item in the list meets this criterion.
99
+
100
+ 1. Source Code.
101
+
102
+ The "source code" for a work means the preferred form of the work
103
+ for making modifications to it. "Object code" means any non-source
104
+ form of a work.
105
+
106
+ A "Standard Interface" means an interface that either is an official
107
+ standard defined by a recognized standards body, or, in the case of
108
+ interfaces specified for a particular programming language, one that
109
+ is widely used among developers working in that language.
110
+
111
+ The "System Libraries" of an executable work include anything, other
112
+ than the work as a whole, that (a) is included in the normal form of
113
+ packaging a Major Component, but which is not part of that Major
114
+ Component, and (b) serves only to enable use of the work with that
115
+ Major Component, or to implement a Standard Interface for which an
116
+ implementation is available to the public in source code form. A
117
+ "Major Component", in this context, means a major essential component
118
+ (kernel, window system, and so on) of the specific operating system
119
+ (if any) on which the executable work runs, or a compiler used to
120
+ produce the work, or an object code interpreter used to run it.
121
+
122
+ The "Corresponding Source" for a work in object code form means all
123
+ the source code needed to generate, install, and (for an executable
124
+ work) run the object code and to modify the work, including scripts to
125
+ control those activities. However, it does not include the work's
126
+ System Libraries, or general-purpose tools or generally available free
127
+ programs which are used unmodified in performing those activities but
128
+ which are not part of the work. For example, Corresponding Source
129
+ includes interface definition files associated with source files for
130
+ the work, and the source code for shared libraries and dynamically
131
+ linked subprograms that the work is specifically designed to require,
132
+ such as by intimate data communication or control flow between those
133
+ subprograms and other parts of the work.
134
+
135
+ The Corresponding Source need not include anything that users
136
+ can regenerate automatically from other parts of the Corresponding
137
+ Source.
138
+
139
+ The Corresponding Source for a work in source code form is that
140
+ same work.
141
+
142
+ 2. Basic Permissions.
143
+
144
+ All rights granted under this License are granted for the term of
145
+ copyright on the Program, and are irrevocable provided the stated
146
+ conditions are met. This License explicitly affirms your unlimited
147
+ permission to run the unmodified Program. The output from running a
148
+ covered work is covered by this License only if the output, given its
149
+ content, constitutes a covered work. This License acknowledges your
150
+ rights of fair use or other equivalent, as provided by copyright law.
151
+
152
+ You may make, run and propagate covered works that you do not
153
+ convey, without conditions so long as your license otherwise remains
154
+ in force. You may convey covered works to others for the sole purpose
155
+ of having them make modifications exclusively for you, or provide you
156
+ with facilities for running those works, provided that you comply with
157
+ the terms of this License in conveying all material for which you do
158
+ not control copyright. Those thus making or running the covered works
159
+ for you must do so exclusively on your behalf, under your direction
160
+ and control, on terms that prohibit them from making any copies of
161
+ your copyrighted material outside their relationship with you.
162
+
163
+ Conveying under any other circumstances is permitted solely under
164
+ the conditions stated below. Sublicensing is not allowed; section 10
165
+ makes it unnecessary.
166
+
167
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168
+
169
+ No covered work shall be deemed part of an effective technological
170
+ measure under any applicable law fulfilling obligations under article
171
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172
+ similar laws prohibiting or restricting circumvention of such
173
+ measures.
174
+
175
+ When you convey a covered work, you waive any legal power to forbid
176
+ circumvention of technological measures to the extent such circumvention
177
+ is effected by exercising rights under this License with respect to
178
+ the covered work, and you disclaim any intention to limit operation or
179
+ modification of the work as a means of enforcing, against the work's
180
+ users, your or third parties' legal rights to forbid circumvention of
181
+ technological measures.
182
+
183
+ 4. Conveying Verbatim Copies.
184
+
185
+ You may convey verbatim copies of the Program's source code as you
186
+ receive it, in any medium, provided that you conspicuously and
187
+ appropriately publish on each copy an appropriate copyright notice;
188
+ keep intact all notices stating that this License and any
189
+ non-permissive terms added in accord with section 7 apply to the code;
190
+ keep intact all notices of the absence of any warranty; and give all
191
+ recipients a copy of this License along with the Program.
192
+
193
+ You may charge any price or no price for each copy that you convey,
194
+ and you may offer support or warranty protection for a fee.
195
+
196
+ 5. Conveying Modified Source Versions.
197
+
198
+ You may convey a work based on the Program, or the modifications to
199
+ produce it from the Program, in the form of source code under the
200
+ terms of section 4, provided that you also meet all of these conditions:
201
+
202
+ a) The work must carry prominent notices stating that you modified
203
+ it, and giving a relevant date.
204
+
205
+ b) The work must carry prominent notices stating that it is
206
+ released under this License and any conditions added under section
207
+ 7. This requirement modifies the requirement in section 4 to
208
+ "keep intact all notices".
209
+
210
+ c) You must license the entire work, as a whole, under this
211
+ License to anyone who comes into possession of a copy. This
212
+ License will therefore apply, along with any applicable section 7
213
+ additional terms, to the whole of the work, and all its parts,
214
+ regardless of how they are packaged. This License gives no
215
+ permission to license the work in any other way, but it does not
216
+ invalidate such permission if you have separately received it.
217
+
218
+ d) If the work has interactive user interfaces, each must display
219
+ Appropriate Legal Notices; however, if the Program has interactive
220
+ interfaces that do not display Appropriate Legal Notices, your
221
+ work need not make them do so.
222
+
223
+ A compilation of a covered work with other separate and independent
224
+ works, which are not by their nature extensions of the covered work,
225
+ and which are not combined with it such as to form a larger program,
226
+ in or on a volume of a storage or distribution medium, is called an
227
+ "aggregate" if the compilation and its resulting copyright are not
228
+ used to limit the access or legal rights of the compilation's users
229
+ beyond what the individual works permit. Inclusion of a covered work
230
+ in an aggregate does not cause this License to apply to the other
231
+ parts of the aggregate.
232
+
233
+ 6. Conveying Non-Source Forms.
234
+
235
+ You may convey a covered work in object code form under the terms
236
+ of sections 4 and 5, provided that you also convey the
237
+ machine-readable Corresponding Source under the terms of this License,
238
+ in one of these ways:
239
+
240
+ a) Convey the object code in, or embodied in, a physical product
241
+ (including a physical distribution medium), accompanied by the
242
+ Corresponding Source fixed on a durable physical medium
243
+ customarily used for software interchange.
244
+
245
+ b) Convey the object code in, or embodied in, a physical product
246
+ (including a physical distribution medium), accompanied by a
247
+ written offer, valid for at least three years and valid for as
248
+ long as you offer spare parts or customer support for that product
249
+ model, to give anyone who possesses the object code either (1) a
250
+ copy of the Corresponding Source for all the software in the
251
+ product that is covered by this License, on a durable physical
252
+ medium customarily used for software interchange, for a price no
253
+ more than your reasonable cost of physically performing this
254
+ conveying of source, or (2) access to copy the
255
+ Corresponding Source from a network server at no charge.
256
+
257
+ c) Convey individual copies of the object code with a copy of the
258
+ written offer to provide the Corresponding Source. This
259
+ alternative is allowed only occasionally and noncommercially, and
260
+ only if you received the object code with such an offer, in accord
261
+ with subsection 6b.
262
+
263
+ d) Convey the object code by offering access from a designated
264
+ place (gratis or for a charge), and offer equivalent access to the
265
+ Corresponding Source in the same way through the same place at no
266
+ further charge. You need not require recipients to copy the
267
+ Corresponding Source along with the object code. If the place to
268
+ copy the object code is a network server, the Corresponding Source
269
+ may be on a different server (operated by you or a third party)
270
+ that supports equivalent copying facilities, provided you maintain
271
+ clear directions next to the object code saying where to find the
272
+ Corresponding Source. Regardless of what server hosts the
273
+ Corresponding Source, you remain obligated to ensure that it is
274
+ available for as long as needed to satisfy these requirements.
275
+
276
+ e) Convey the object code using peer-to-peer transmission, provided
277
+ you inform other peers where the object code and Corresponding
278
+ Source of the work are being offered to the general public at no
279
+ charge under subsection 6d.
280
+
281
+ A separable portion of the object code, whose source code is excluded
282
+ from the Corresponding Source as a System Library, need not be
283
+ included in conveying the object code work.
284
+
285
+ A "User Product" is either (1) a "consumer product", which means any
286
+ tangible personal property which is normally used for personal, family,
287
+ or household purposes, or (2) anything designed or sold for incorporation
288
+ into a dwelling. In determining whether a product is a consumer product,
289
+ doubtful cases shall be resolved in favor of coverage. For a particular
290
+ product received by a particular user, "normally used" refers to a
291
+ typical or common use of that class of product, regardless of the status
292
+ of the particular user or of the way in which the particular user
293
+ actually uses, or expects or is expected to use, the product. A product
294
+ is a consumer product regardless of whether the product has substantial
295
+ commercial, industrial or non-consumer uses, unless such uses represent
296
+ the only significant mode of use of the product.
297
+
298
+ "Installation Information" for a User Product means any methods,
299
+ procedures, authorization keys, or other information required to install
300
+ and execute modified versions of a covered work in that User Product from
301
+ a modified version of its Corresponding Source. The information must
302
+ suffice to ensure that the continued functioning of the modified object
303
+ code is in no case prevented or interfered with solely because
304
+ modification has been made.
305
+
306
+ If you convey an object code work under this section in, or with, or
307
+ specifically for use in, a User Product, and the conveying occurs as
308
+ part of a transaction in which the right of possession and use of the
309
+ User Product is transferred to the recipient in perpetuity or for a
310
+ fixed term (regardless of how the transaction is characterized), the
311
+ Corresponding Source conveyed under this section must be accompanied
312
+ by the Installation Information. But this requirement does not apply
313
+ if neither you nor any third party retains the ability to install
314
+ modified object code on the User Product (for example, the work has
315
+ been installed in ROM).
316
+
317
+ The requirement to provide Installation Information does not include a
318
+ requirement to continue to provide support service, warranty, or updates
319
+ for a work that has been modified or installed by the recipient, or for
320
+ the User Product in which it has been modified or installed. Access to a
321
+ network may be denied when the modification itself materially and
322
+ adversely affects the operation of the network or violates the rules and
323
+ protocols for communication across the network.
324
+
325
+ Corresponding Source conveyed, and Installation Information provided,
326
+ in accord with this section must be in a format that is publicly
327
+ documented (and with an implementation available to the public in
328
+ source code form), and must require no special password or key for
329
+ unpacking, reading or copying.
330
+
331
+ 7. Additional Terms.
332
+
333
+ "Additional permissions" are terms that supplement the terms of this
334
+ License by making exceptions from one or more of its conditions.
335
+ Additional permissions that are applicable to the entire Program shall
336
+ be treated as though they were included in this License, to the extent
337
+ that they are valid under applicable law. If additional permissions
338
+ apply only to part of the Program, that part may be used separately
339
+ under those permissions, but the entire Program remains governed by
340
+ this License without regard to the additional permissions.
341
+
342
+ When you convey a copy of a covered work, you may at your option
343
+ remove any additional permissions from that copy, or from any part of
344
+ it. (Additional permissions may be written to require their own
345
+ removal in certain cases when you modify the work.) You may place
346
+ additional permissions on material, added by you to a covered work,
347
+ for which you have or can give appropriate copyright permission.
348
+
349
+ Notwithstanding any other provision of this License, for material you
350
+ add to a covered work, you may (if authorized by the copyright holders of
351
+ that material) supplement the terms of this License with terms:
352
+
353
+ a) Disclaiming warranty or limiting liability differently from the
354
+ terms of sections 15 and 16 of this License; or
355
+
356
+ b) Requiring preservation of specified reasonable legal notices or
357
+ author attributions in that material or in the Appropriate Legal
358
+ Notices displayed by works containing it; or
359
+
360
+ c) Prohibiting misrepresentation of the origin of that material, or
361
+ requiring that modified versions of such material be marked in
362
+ reasonable ways as different from the original version; or
363
+
364
+ d) Limiting the use for publicity purposes of names of licensors or
365
+ authors of the material; or
366
+
367
+ e) Declining to grant rights under trademark law for use of some
368
+ trade names, trademarks, or service marks; or
369
+
370
+ f) Requiring indemnification of licensors and authors of that
371
+ material by anyone who conveys the material (or modified versions of
372
+ it) with contractual assumptions of liability to the recipient, for
373
+ any liability that these contractual assumptions directly impose on
374
+ those licensors and authors.
375
+
376
+ All other non-permissive additional terms are considered "further
377
+ restrictions" within the meaning of section 10. If the Program as you
378
+ received it, or any part of it, contains a notice stating that it is
379
+ governed by this License along with a term that is a further
380
+ restriction, you may remove that term. If a license document contains
381
+ a further restriction but permits relicensing or conveying under this
382
+ License, you may add to a covered work material governed by the terms
383
+ of that license document, provided that the further restriction does
384
+ not survive such relicensing or conveying.
385
+
386
+ If you add terms to a covered work in accord with this section, you
387
+ must place, in the relevant source files, a statement of the
388
+ additional terms that apply to those files, or a notice indicating
389
+ where to find the applicable terms.
390
+
391
+ Additional terms, permissive or non-permissive, may be stated in the
392
+ form of a separately written license, or stated as exceptions;
393
+ the above requirements apply either way.
394
+
395
+ 8. Termination.
396
+
397
+ You may not propagate or modify a covered work except as expressly
398
+ provided under this License. Any attempt otherwise to propagate or
399
+ modify it is void, and will automatically terminate your rights under
400
+ this License (including any patent licenses granted under the third
401
+ paragraph of section 11).
402
+
403
+ However, if you cease all violation of this License, then your
404
+ license from a particular copyright holder is reinstated (a)
405
+ provisionally, unless and until the copyright holder explicitly and
406
+ finally terminates your license, and (b) permanently, if the copyright
407
+ holder fails to notify you of the violation by some reasonable means
408
+ prior to 60 days after the cessation.
409
+
410
+ Moreover, your license from a particular copyright holder is
411
+ reinstated permanently if the copyright holder notifies you of the
412
+ violation by some reasonable means, this is the first time you have
413
+ received notice of violation of this License (for any work) from that
414
+ copyright holder, and you cure the violation prior to 30 days after
415
+ your receipt of the notice.
416
+
417
+ Termination of your rights under this section does not terminate the
418
+ licenses of parties who have received copies or rights from you under
419
+ this License. If your rights have been terminated and not permanently
420
+ reinstated, you do not qualify to receive new licenses for the same
421
+ material under section 10.
422
+
423
+ 9. Acceptance Not Required for Having Copies.
424
+
425
+ You are not required to accept this License in order to receive or
426
+ run a copy of the Program. Ancillary propagation of a covered work
427
+ occurring solely as a consequence of using peer-to-peer transmission
428
+ to receive a copy likewise does not require acceptance. However,
429
+ nothing other than this License grants you permission to propagate or
430
+ modify any covered work. These actions infringe copyright if you do
431
+ not accept this License. Therefore, by modifying or propagating a
432
+ covered work, you indicate your acceptance of this License to do so.
433
+
434
+ 10. Automatic Licensing of Downstream Recipients.
435
+
436
+ Each time you convey a covered work, the recipient automatically
437
+ receives a license from the original licensors, to run, modify and
438
+ propagate that work, subject to this License. You are not responsible
439
+ for enforcing compliance by third parties with this License.
440
+
441
+ An "entity transaction" is a transaction transferring control of an
442
+ organization, or substantially all assets of one, or subdividing an
443
+ organization, or merging organizations. If propagation of a covered
444
+ work results from an entity transaction, each party to that
445
+ transaction who receives a copy of the work also receives whatever
446
+ licenses to the work the party's predecessor in interest had or could
447
+ give under the previous paragraph, plus a right to possession of the
448
+ Corresponding Source of the work from the predecessor in interest, if
449
+ the predecessor has it or can get it with reasonable efforts.
450
+
451
+ You may not impose any further restrictions on the exercise of the
452
+ rights granted or affirmed under this License. For example, you may
453
+ not impose a license fee, royalty, or other charge for exercise of
454
+ rights granted under this License, and you may not initiate litigation
455
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
456
+ any patent claim is infringed by making, using, selling, offering for
457
+ sale, or importing the Program or any portion of it.
458
+
459
+ 11. Patents.
460
+
461
+ A "contributor" is a copyright holder who authorizes use under this
462
+ License of the Program or a work on which the Program is based. The
463
+ work thus licensed is called the contributor's "contributor version".
464
+
465
+ A contributor's "essential patent claims" are all patent claims
466
+ owned or controlled by the contributor, whether already acquired or
467
+ hereafter acquired, that would be infringed by some manner, permitted
468
+ by this License, of making, using, or selling its contributor version,
469
+ but do not include claims that would be infringed only as a
470
+ consequence of further modification of the contributor version. For
471
+ purposes of this definition, "control" includes the right to grant
472
+ patent sublicenses in a manner consistent with the requirements of
473
+ this License.
474
+
475
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
476
+ patent license under the contributor's essential patent claims, to
477
+ make, use, sell, offer for sale, import and otherwise run, modify and
478
+ propagate the contents of its contributor version.
479
+
480
+ In the following three paragraphs, a "patent license" is any express
481
+ agreement or commitment, however denominated, not to enforce a patent
482
+ (such as an express permission to practice a patent or covenant not to
483
+ sue for patent infringement). To "grant" such a patent license to a
484
+ party means to make such an agreement or commitment not to enforce a
485
+ patent against the party.
486
+
487
+ If you convey a covered work, knowingly relying on a patent license,
488
+ and the Corresponding Source of the work is not available for anyone
489
+ to copy, free of charge and under the terms of this License, through a
490
+ publicly available network server or other readily accessible means,
491
+ then you must either (1) cause the Corresponding Source to be so
492
+ available, or (2) arrange to deprive yourself of the benefit of the
493
+ patent license for this particular work, or (3) arrange, in a manner
494
+ consistent with the requirements of this License, to extend the patent
495
+ license to downstream recipients. "Knowingly relying" means you have
496
+ actual knowledge that, but for the patent license, your conveying the
497
+ covered work in a country, or your recipient's use of the covered work
498
+ in a country, would infringe one or more identifiable patents in that
499
+ country that you have reason to believe are valid.
500
+
501
+ If, pursuant to or in connection with a single transaction or
502
+ arrangement, you convey, or propagate by procuring conveyance of, a
503
+ covered work, and grant a patent license to some of the parties
504
+ receiving the covered work authorizing them to use, propagate, modify
505
+ or convey a specific copy of the covered work, then the patent license
506
+ you grant is automatically extended to all recipients of the covered
507
+ work and works based on it.
508
+
509
+ A patent license is "discriminatory" if it does not include within
510
+ the scope of its coverage, prohibits the exercise of, or is
511
+ conditioned on the non-exercise of one or more of the rights that are
512
+ specifically granted under this License. You may not convey a covered
513
+ work if you are a party to an arrangement with a third party that is
514
+ in the business of distributing software, under which you make payment
515
+ to the third party based on the extent of your activity of conveying
516
+ the work, and under which the third party grants, to any of the
517
+ parties who would receive the covered work from you, a discriminatory
518
+ patent license (a) in connection with copies of the covered work
519
+ conveyed by you (or copies made from those copies), or (b) primarily
520
+ for and in connection with specific products or compilations that
521
+ contain the covered work, unless you entered into that arrangement,
522
+ or that patent license was granted, prior to 28 March 2007.
523
+
524
+ Nothing in this License shall be construed as excluding or limiting
525
+ any implied license or other defenses to infringement that may
526
+ otherwise be available to you under applicable patent law.
527
+
528
+ 12. No Surrender of Others' Freedom.
529
+
530
+ If conditions are imposed on you (whether by court order, agreement or
531
+ otherwise) that contradict the conditions of this License, they do not
532
+ excuse you from the conditions of this License. If you cannot convey a
533
+ covered work so as to satisfy simultaneously your obligations under this
534
+ License and any other pertinent obligations, then as a consequence you may
535
+ not convey it at all. For example, if you agree to terms that obligate you
536
+ to collect a royalty for further conveying from those to whom you convey
537
+ the Program, the only way you could satisfy both those terms and this
538
+ License would be to refrain entirely from conveying the Program.
539
+
540
+ 13. Remote Network Interaction; Use with the GNU General Public License.
541
+
542
+ Notwithstanding any other provision of this License, if you modify the
543
+ Program, your modified version must prominently offer all users
544
+ interacting with it remotely through a computer network (if your version
545
+ supports such interaction) an opportunity to receive the Corresponding
546
+ Source of your version by providing access to the Corresponding Source
547
+ from a network server at no charge, through some standard or customary
548
+ means of facilitating copying of software. This Corresponding Source
549
+ shall include the Corresponding Source for any work covered by version 3
550
+ of the GNU General Public License that is incorporated pursuant to the
551
+ following paragraph.
552
+
553
+ Notwithstanding any other provision of this License, you have
554
+ permission to link or combine any covered work with a work licensed
555
+ under version 3 of the GNU General Public License into a single
556
+ combined work, and to convey the resulting work. The terms of this
557
+ License will continue to apply to the part which is the covered work,
558
+ but the work with which it is combined will remain governed by version
559
+ 3 of the GNU General Public License.
560
+
561
+ 14. Revised Versions of this License.
562
+
563
+ The Free Software Foundation may publish revised and/or new versions of
564
+ the GNU Affero General Public License from time to time. Such new versions
565
+ will be similar in spirit to the present version, but may differ in detail to
566
+ address new problems or concerns.
567
+
568
+ Each version is given a distinguishing version number. If the
569
+ Program specifies that a certain numbered version of the GNU Affero General
570
+ Public License "or any later version" applies to it, you have the
571
+ option of following the terms and conditions either of that numbered
572
+ version or of any later version published by the Free Software
573
+ Foundation. If the Program does not specify a version number of the
574
+ GNU Affero General Public License, you may choose any version ever published
575
+ by the Free Software Foundation.
576
+
577
+ If the Program specifies that a proxy can decide which future
578
+ versions of the GNU Affero General Public License can be used, that proxy's
579
+ public statement of acceptance of a version permanently authorizes you
580
+ to choose that version for the Program.
581
+
582
+ Later license versions may give you additional or different
583
+ permissions. However, no additional obligations are imposed on any
584
+ author or copyright holder as a result of your choosing to follow a
585
+ later version.
586
+
587
+ 15. Disclaimer of Warranty.
588
+
589
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597
+
598
+ 16. Limitation of Liability.
599
+
600
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608
+ SUCH DAMAGES.
609
+
610
+ 17. Interpretation of Sections 15 and 16.
611
+
612
+ If the disclaimer of warranty and limitation of liability provided
613
+ above cannot be given local legal effect according to their terms,
614
+ reviewing courts shall apply local law that most closely approximates
615
+ an absolute waiver of all civil liability in connection with the
616
+ Program, unless a warranty or assumption of liability accompanies a
617
+ copy of the Program in return for a fee.
618
+
619
+ END OF TERMS AND CONDITIONS
620
+
621
+ How to Apply These Terms to Your New Programs
622
+
623
+ If you develop a new program, and you want it to be of the greatest
624
+ possible use to the public, the best way to achieve this is to make it
625
+ free software which everyone can redistribute and change under these terms.
626
+
627
+ To do so, attach the following notices to the program. It is safest
628
+ to attach them to the start of each source file to most effectively
629
+ state the exclusion of warranty; and each file should have at least
630
+ the "copyright" line and a pointer to where the full notice is found.
631
+
632
+ <one line to give the program's name and a brief idea of what it does.>
633
+ Copyright (C) <year> <name of author>
634
+
635
+ This program is free software: you can redistribute it and/or modify
636
+ it under the terms of the GNU Affero General Public License as published
637
+ by the Free Software Foundation, either version 3 of the License, or
638
+ (at your option) any later version.
639
+
640
+ This program is distributed in the hope that it will be useful,
641
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
642
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643
+ GNU Affero General Public License for more details.
644
+
645
+ You should have received a copy of the GNU Affero General Public License
646
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
647
+
648
+ Also add information on how to contact you by electronic and paper mail.
649
+
650
+ If your software can interact with users remotely through a computer
651
+ network, you should also make sure that it provides a way for users to
652
+ get its source. For example, if your program is a web application, its
653
+ interface could display a "Source" link that leads users to an archive
654
+ of the code. There are many ways you could offer source, and different
655
+ solutions will be better for different programs; see section 13 for the
656
+ specific requirements.
657
+
658
+ You should also get your employer (if you work as a programmer) or school,
659
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
660
+ For more information on this, and how to apply and follow the GNU AGPL, see
661
+ <https://www.gnu.org/licenses/>.
README.md ADDED
@@ -0,0 +1,53 @@
+ ---
+ title: Autodoc Lifter
+ emoji: 🦊📝
+ colorFrom: yellow
+ colorTo: red
+ python_version: 3.11.9
+ sdk: streamlit
+ sdk_version: 1.37.1
+ suggested_hardware: t4-small
+ suggested_storage: small
+ app_file: app.py
+ header: mini
+ short_description: Good Local RAG for Bad PDFs
+ models: [timm/resnet18.a1_in1k, microsoft/table-transformer-detection, mixedbread-ai/mxbai-embed-large-v1, mixedbread-ai/mxbai-rerank-large-v1, meta-llama/Meta-Llama-3.1-8B-Instruct, Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5]
+ tags: [rag, llm, pdf, document]
+ license: agpl-3.0
+ pinned: true
+ preload_from_hub:
+ - timm/resnet18.a1_in1k
+ - microsoft/table-transformer-detection
+ - mixedbread-ai/mxbai-embed-large-v1
+ - mixedbread-ai/mxbai-rerank-large-v1
+ - Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5
+ ---
+
+ ## Autodoc Lifter
+
+ A document RAG system built with LLMs.
+ Some key goals for the project, once finished:
+
+ 0. All open, all local.
+ I don't want to be calling APIs. You can run the entire app locally and inspect the code and models.
+ This is particularly suitable for handling restricted information.
+ Yes, I know this is a web demo on Spaces, so don't actually do that here.
+ Use the GitHub link: (here, once it's no longer ClosedAI)
+
+ 1. Support for atrocious and varied PDFs.
+ Have images? Have tables? Have a set of PDFs with the worst quality and page layout known to man?
+ Give it a try here. I've been slowly building out custom processing for difficult documents by connecting Unstructured.IO to LlamaIndex in a slightly useful way.
+ (A future dream: get rid of Unstructured and build our own pipeline one day.)
+
+ 2. Multiple PDFs, handled with agents.
+ Instead of dumping all the documents into one central vector store and praying it works out,
+ I'm trying to be more thoughtful about how to incorporate multiple documents.
+
+ 3. Answers that are sourced and verifiable.
+ I'm sorry, but as a Definitely Human Person, I don't like hallucinated answers-ex-machina.
+ Responses should give actual citations \[0\] when pulling text directly from source documents,
+ and there should be a way to view the citations, the referenced text, and the document itself.
+
+ --- CITATIONS ---
+ \[0\] Relies primarily on fuzzy string matching, because it's computationally cheaper and also
+ ensures that cited text actually occurs in the source documents.
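To make the fuzzy-matching idea in \[0\] concrete, here is a minimal sketch using only Python's standard `difflib`; the function names and the 0.8 cutoff are illustrative assumptions, not the actual API of the repo's `citation.py`:

```python
# Illustrative sketch: check whether a cited span approximately occurs in a source text.
from difflib import SequenceMatcher

def best_fuzzy_score(cited: str, source: str, pad: int = 20) -> float:
    """Best similarity of `cited` against a window of `source` around their longest common block."""
    matcher = SequenceMatcher(None, source, cited, autojunk=False)
    block = matcher.find_longest_match(0, len(source), 0, len(cited))
    start = max(0, block.a - pad)
    end = min(len(source), block.a + len(cited) + pad)
    return SequenceMatcher(None, source[start:end], cited).ratio()

def citation_is_supported(cited: str, source: str, cutoff: float = 0.8) -> bool:
    # A citation passes only if the quoted text (nearly) appears verbatim in the source.
    return best_fuzzy_score(cited, source) >= cutoff
```

Unlike asking an LLM to verify its own quotes, a check of this kind is cheap and cannot be passed by text that never appears in the document.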
agent.py ADDED
@@ -0,0 +1,92 @@
+ #####################################################
+ ### DOCUMENT PROCESSOR [AGENT]
+ #####################################################
+ ### Jonathan Wang
+
+ # ABOUT:
+ # This creates an app to chat with PDFs.
+
+ # This is the AGENT
+ # which handles complex questions about the PDF.
+ #####################################################
+ ### TODO Board:
+ # https://docs.llamaindex.ai/en/stable/examples/agent/agent_runner/agent_runner_rag_controllable/#setup-human-in-the-loop-chat
+ # Investigate ObjectIndex and retrievers? https://docs.llamaindex.ai/en/stable/examples/agent/multi_document_agents/
+ # https://docs.llamaindex.ai/en/stable/module_guides/storing/chat_stores/
+
+ #####################################################
+ ### IMPORTS
+ import warnings
+ from typing import List
+
+ from streamlit import session_state as ss
+
+ from llama_index.core.settings import Settings
+ from llama_index.core.tools import QueryEngineTool, ToolMetadata
+ from llama_index.core.query_engine import SubQuestionQueryEngine
+
+ # Own Modules
+ from full_doc import FullDocument
+
+ #####################################################
+ ### CODE
+
+ ALLOWED_DOCUMENT_TOOLS = ['engine', 'subquestion_engine']
+ ALLOWED_TOOLS = ALLOWED_DOCUMENT_TOOLS
+
+ def _build_tool_from_fulldoc(fulldoc: FullDocument, tool_name: str) -> QueryEngineTool:
+     """Given a Full Document, build a QueryEngineTool from the specified engine.
+
+     Args:
+         fulldoc (FullDocument): The FullDocument (doc + query engines)
+         tool_name (str): The engine to use.
+
+     Returns:
+         QueryEngineTool: A query engine wrapper around the tool.
+     """
+     if (tool_name.lower() not in ALLOWED_DOCUMENT_TOOLS):
+         raise ValueError(f"`tool_name` must be one of {ALLOWED_DOCUMENT_TOOLS}")
+     if (getattr(fulldoc, tool_name, None) is None):
+         raise ValueError(f"`{tool_name}` must be created from the document first.")
+
+     # Build Tool
+     tool_description = ''
+     if tool_name == 'engine':
+         tool_description += 'A tool that answers simple questions about the following document:\n' + fulldoc.summary_oneline
+     elif tool_name == 'subquestion_engine':
+         tool_description += 'A tool that answers complex questions about the following document:\n' + fulldoc.summary_oneline
+
+     tool = QueryEngineTool(
+         query_engine=getattr(fulldoc, tool_name),
+         metadata=ToolMetadata(
+             name=tool_name,
+             description=tool_description
+         ),
+     )
+     return tool
+
+ def doclist_to_agent(doclist: List[FullDocument], fulldoc_tools_to_use: List[str] = ['engine']) -> SubQuestionQueryEngine:  # ReActAgent:
+     # Agent Tools
+     agent_tools = []
+
+     # Remove any tools that are not in the allowed list using set intersection.
+     tools_to_use = list(set(fulldoc_tools_to_use).intersection(set(ALLOWED_DOCUMENT_TOOLS)))
+     if (len(tools_to_use) < len(fulldoc_tools_to_use)):
+         removed_tools = set(fulldoc_tools_to_use) - set(ALLOWED_DOCUMENT_TOOLS)
+         warnings.warn(f"Tools {removed_tools} are not in the allowed list of tools. Skipping...")
+         del removed_tools
+
+     for tool in tools_to_use:
+         for doc in doclist:
+             agent_tools.append(_build_tool_from_fulldoc(doc, tool))
+
+     # Agent
+     # agent = ReActAgent.from_tools(
+     agent = SubQuestionQueryEngine.from_defaults(
+         # tools=agent_tools,
+         query_engine_tools=agent_tools,
+         llm=Settings.llm or ss.llm,
+         verbose=True,
+         # max_iterations=5
+     )
+
+     return agent
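A minimal usage sketch, assuming `ss.doclist` already holds `FullDocument` objects whose `engine` and `subquestion_engine` have been built (otherwise `_build_tool_from_fulldoc` raises); the question text is made up for illustration:

```python
# Hypothetical wiring: one sub-question engine over every uploaded document.
from agent import doclist_to_agent

agent = doclist_to_agent(ss.doclist, fulldoc_tools_to_use=["engine", "subquestion_engine"])
response = agent.query("Compare the conclusions of the uploaded reports.")  # example question
print(response.response)
```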
app.py ADDED
@@ -0,0 +1,471 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [APP]
3
+ #####################################################
4
+ ### Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This creates an app to chat with PDFs.
8
+
9
+ # This is the APP
10
+ # which runs the backend and codes the frontend UI.
11
+ #####################################################
12
+ ### TODO Board:
13
+ # Try ColPali? https://huggingface.co/vidore/colpali
14
+
15
+ #####################################################
16
+ ### PROGRAM IMPORTS
17
+ from __future__ import annotations
18
+
19
+ import base64
20
+ import gc
21
+ import logging
22
+ import os
23
+ import random
24
+ import sys
25
+ import warnings
26
+ from pathlib import Path
27
+ from typing import Any, cast
28
+
29
+ import nest_asyncio
30
+ import numpy as np
31
+ import streamlit as st
32
+ from llama_index.core import Settings, get_response_synthesizer
33
+ from llama_index.core.base.llms import BaseLLM
34
+ from llama_index.core.postprocessor import (
35
+ SentenceEmbeddingOptimizer,
36
+ SimilarityPostprocessor,
37
+ )
38
+ from llama_index.core.response_synthesizers import ResponseMode
39
+ from streamlit import session_state as ss
40
+ from summary import (
41
+ ImageSummaryMetadataAdder,
42
+ TableSummaryMetadataAdder,
43
+ get_tree_summarizer,
44
+ )
45
+ from torch.cuda import (
46
+ empty_cache,
47
+ get_device_name,
48
+ is_available,
49
+ manual_seed,
50
+ mem_get_info,
51
+ )
52
+ from transformers import set_seed
53
+
54
+ # Own Modules
55
+ from agent import doclist_to_agent
56
+ from citation import get_citation_builder
57
+ from full_doc import FullDocument
58
+ from keywords import KeywordMetadataAdder
59
+ from metadata_adder import UnstructuredPDFPostProcessor
60
+ from models import get_embedder, get_llm, get_multimodal_llm, get_reranker
61
+ from obs_logging import get_callback_manager, get_obs
62
+ from pdf_reader import UnstructuredPDFReader
63
+ from pdf_reader_utils import (
64
+ chunk_by_header,
65
+ clean_abbreviations,
66
+ combine_listitem_chunks,
67
+ dedupe_title_chunks,
68
+ remove_header_footer_repeated,
69
+ )
70
+ from parsers import get_parser
71
+ from prompts import get_qa_prompt, get_refine_prompt
72
+
73
+ #####################################
74
+ ### SETTINGS
75
+ # Logging
76
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
77
+ logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
78
+
79
+ # CUDA GPU memory avoid fragmentation.
80
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" # avoid vram frag
81
+ os.environ["MAX_SPLIT_SIZE_MB"] = "128"
82
+ os.environ["SCARF_NO_ANALYTICS"] = "true" # get rid of data collection from Unstructured
83
+ os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
84
+
85
+ os.environ["HF_HOME"] = "/data/.huggingface" # save cached models on disk.
86
+
87
+ SEED = 31415926
88
+
89
+ print(f"CUDA Availablility: {is_available()}")
90
+ print(f"CUDA Device Name: {get_device_name()}")
91
+ print(f"CUDA Memory: {mem_get_info()}")
92
+
93
+ gc.collect()
94
+ empty_cache()
95
+
96
+ # Asyncio: fix some issues with nesting https://github.com/run-llama/llama_index/issues/9978
97
+ nest_asyncio.apply()
98
+
99
+ # Set seeds
100
+ if (random.getstate() is None):
101
+ random.seed(SEED) # python
102
+ np.random.seed(SEED) # numpy # TODO(Jonathan Wang): Replace with generator
103
+ manual_seed(SEED) # pytorch
104
+ set_seed(SEED) # transformers
105
+
106
+ # API Keys
107
+ os.environ["HF_TOKEN"] = st.secrets["huggingface_api_token"]
108
+ os.environ["OPENAI_API_KEY"] = st.secrets["openai_api_key"]
109
+ os.environ["GROQ_API_KEY"] = st.secrets["groq_api_key"]
110
+
111
+ #########################################################################
112
+ ### SESSION STATE INITIALIZATION
113
+ st.set_page_config(layout="wide")
114
+
115
+ if "pdf_ref" not in ss:
116
+ ss.input_pdf = []
117
+ if "doclist" not in ss:
118
+ ss.doclist = []
119
+ if "pdf_reader" not in ss:
120
+ ss.pdf_reader = None
121
+ if "pdf_postprocessor" not in ss:
122
+ ss.pdf_postprocessor = None
123
+ # if 'sentence_model' not in ss:
124
+ # ss.sentence_model = None # sentence splitting model, as alternative to nltk/PySBD
125
+ if "embed_model" not in ss:
126
+ ss.embed_model = None
127
+ gc.collect()
128
+ empty_cache()
129
+ if "reranker_model" not in ss:
130
+ ss.reranker_model = None
131
+ gc.collect()
132
+ empty_cache()
133
+ if "llm" not in ss:
134
+ ss.llm = None
135
+ gc.collect()
136
+ empty_cache()
137
+ if "multimodal_llm" not in ss:
138
+ ss.multimodal_llm = None
139
+ gc.collect()
140
+ empty_cache()
141
+ if "callback_manager" not in ss:
142
+ ss.callback_manager = None
143
+ if "node_parser" not in ss:
144
+ ss.node_parser = None
145
+ if "node_postprocessors" not in ss:
146
+ ss.node_postprocessors = None
147
+ if "response_synthesizer" not in ss:
148
+ ss.response_synthesizer = None
149
+ if "tree_summarizer" not in ss:
150
+ ss.tree_summarizer = None
151
+ if "citation_builder" not in ss:
152
+ ss.citation_builder = None
153
+ if "agent" not in ss:
154
+ ss.agent = None
155
+ if "observability" not in ss:
156
+ ss.observability = None
157
+
158
+ if "uploaded_files" not in ss:
159
+ ss.uploaded_files = []
160
+ if "selected_file" not in ss:
161
+ ss.selected_file = None
162
+
163
+ if "chat_messages" not in ss:
164
+ ss.chat_messages = []
165
+
166
+ ################################################################################
167
+ ### SCRIPT
168
+
169
+ st.markdown("""
170
+ <style>
171
+ .block-container {
172
+ padding-top: 3rem;
173
+ padding-bottom: 0rem;
174
+ padding-left: 3rem;
175
+ padding-right: 3rem;
176
+ }
177
+ </style>
178
+ """, unsafe_allow_html=True)
179
+
180
+ ### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
181
+ ### UI
182
+ st.text("Autodoc Lifter Local PDF Chatbot (Built with Meta🦙3)")
183
+ col_left, col_right = st.columns([1, 1])
184
+
185
+ ### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
186
+ ### PDF Upload UI (Left Panel)
187
+ with st.sidebar:
188
+ uploaded_files = st.file_uploader(
189
+ label="Upload a PDF file.",
190
+ type="pdf",
191
+ accept_multiple_files=True,
192
+ label_visibility="collapsed",
193
+ )
194
+
195
+ ### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
196
+ ### PDF Display UI (Middle Panel)
197
+ # NOTE: This currently only displays the PDF, which requires user interaction (below)
198
+
199
+ ### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
200
+ ### Chat UI (Right Panel)
201
+
202
+ with col_right:
203
+ messages_container = st.container(height=475, border=False)
204
+ input_container = st.container(height=80, border=False)
205
+
206
+ with messages_container:
207
+ for message in ss.chat_messages:
208
+ with st.chat_message(message["role"]):
209
+ st.markdown(message["content"])
210
+
211
+ with input_container:
212
+ # Accept user input
213
+ prompt = st.chat_input("Ask your question about the document here.")
214
+
215
+ ### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
216
+ ### Get Models and Settings
217
+ # Get Vision LLM
218
+ if (ss.multimodal_llm is None):
219
+ print(f"CUDA Memory Pre-VLLM: {mem_get_info()}")
220
+ vision_llm = get_multimodal_llm()
221
+ ss.multimodal_llm = vision_llm
222
+
223
+ # Get LLM
224
+ if (ss.llm is None):
225
+ print(f"CUDA Memory Pre-LLM: {mem_get_info()}")
226
+ llm = get_llm()
227
+ ss.llm = llm
228
+ Settings.llm = cast(BaseLLM, llm)
229
+
230
+ # Get Sentence Splitting Model.
231
+ # if (ss.sentence_model is None):
232
+ # sent_splitter = get_sat_sentence_splitter('sat-3l-sm')
233
+ # ss.sentence_model = sent_splitter
234
+
235
+ # Get Embedding Model
236
+ if (ss.embed_model is None):
237
+ print(f"CUDA Memory Pre-Embedding: {mem_get_info()}")
238
+ embed_model = get_embedder()
239
+ ss.embed_model = embed_model
240
+ Settings.embed_model = embed_model
241
+
242
+ # Get Reranker
243
+ if (ss.reranker_model is None):
244
+ print(f"CUDA Memory Pre-Reranking: {mem_get_info()}")
245
+ ss.reranker_model = get_reranker()
246
+
247
+ # Get Callback Manager
248
+ if (ss.callback_manager is None):
249
+ callback_manager = get_callback_manager()
250
+ ss.callback_manager = callback_manager
251
+ Settings.callback_manager = callback_manager
252
+
253
+ # Get Node Parser
254
+ if (ss.node_parser is None):
255
+ node_parser = get_parser(
256
+ embed_model=Settings.embed_model,
257
+ callback_manager=ss.callback_manager
258
+ )
259
+ ss.node_parser = node_parser
260
+ Settings.node_parser = node_parser
261
+
262
+ #### Get Observability
263
+ if (ss.observability is None):
264
+ ss.observability = get_obs()
265
+
266
+ ### Get PDF Reader
267
+ if (ss.pdf_reader is None):
268
+ ss.pdf_reader = UnstructuredPDFReader()
269
+
270
+ ### Get PDF Reader Postprocessing
271
+ if (ss.pdf_postprocessor is None):
272
+ # Get embedding
273
+ # regex_adder = RegexMetadataAdder(regex_pattern=) # Are there any that I need?
274
+ keyword_adder = KeywordMetadataAdder(metadata_name="keywords")
275
+ table_summary_adder = TableSummaryMetadataAdder(llm=ss.llm)
276
+ image_summary_adder = ImageSummaryMetadataAdder(llm=ss.multimodal_llm)
277
+
278
+ pdf_postprocessor = UnstructuredPDFPostProcessor(
279
+ embed_model=ss.embed_model,
280
+ metadata_adders=[keyword_adder, table_summary_adder, image_summary_adder]
281
+ )
282
+ ss.pdf_postprocessor = pdf_postprocessor
283
+
284
+ #### Get Observability
285
+ if (ss.observability is None):
286
+ ss.observability = get_obs()
287
+ observability = ss.observability
288
+
289
+ ### Get Node Postprocessor Pipeline
290
+ if (ss.node_postprocessors is None):
291
+ from nltk.tokenize import PunktTokenizer
292
+ punkt_tokenizer = PunktTokenizer()
293
+ ss.node_postprocessors = [
294
+ SimilarityPostprocessor(similarity_cutoff=0.01), # remove nodes unrelated to query
295
+ ss.reranker_model, # rerank
296
+ # remove sentences less related to query. lower is stricter
297
+ SentenceEmbeddingOptimizer(tokenizer_fn=punkt_tokenizer.tokenize, percentile_cutoff=0.2),
298
+ ]
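+ # NOTE: at query time these run in list order (see RAGQueryEngine._apply_node_postprocessors
+ # in engine.py): similarity filter first, then the reranker, then sentence-level pruning.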
299
+
300
+ ### Get Response Synthesizer
301
+ if (ss.response_synthesizer is None):
302
+ ss.response_synthesizer = get_response_synthesizer(
303
+ response_mode=ResponseMode.COMPACT,
304
+ text_qa_template=get_qa_prompt(),
305
+ refine_template=get_refine_prompt()
306
+ )
307
+
308
+ ### Get Tree Summarizer
309
+ if (ss.tree_summarizer is None):
310
+ ss.tree_summarizer = get_tree_summarizer()
311
+
312
+ ### Get Citation Builder
313
+ if (ss.citation_builder is None):
314
+ ss.citation_builder = get_citation_builder()
315
+
316
+ ### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
317
+ ### Handle User Interaction
318
+ def handle_new_pdf(file_io: Any) -> None:
319
+ """Handle processing a new source PDF file document."""
320
+ with st.sidebar:
321
+ with (st.spinner("Reading input file, this may take some time...")):
322
+ ### Save Locally
323
+ # TODO(Jonathan Wang): Get the user to upload their file with a reference name in a separate tab.
324
+ if not Path(__file__).parent.joinpath("data").exists():
325
+ print("NEWPDF: Making data directory...")
326
+ Path(__file__).parent.joinpath("data").mkdir(parents=True)
327
+ with open(Path(__file__).parent.joinpath("data/input.pdf"), "wb") as f:
328
+ print("NEWPDF: Writing input file...")
329
+ f.write(file_io.getbuffer())
330
+
331
+ ### Create Document
332
+ print("NEWPDF: Building Document...")
333
+ new_document = FullDocument(
334
+ name="input.pdf",
335
+ file_path=Path(__file__).parent.joinpath("data/input.pdf"),
336
+ )
337
+
338
+ #### Process document.
339
+ print("NEWPDF: Processing document into nodes...")
340
+ new_document.file_to_nodes(
341
+ reader=ss.pdf_reader,
342
+ postreaders=[
343
+ clean_abbreviations, dedupe_title_chunks, combine_listitem_chunks,
344
+ remove_header_footer_repeated, chunk_by_header
345
+ ],
346
+ node_parser=ss.node_parser,
347
+ postparsers=[ss.pdf_postprocessor],
348
+ )
349
+
350
+ ### Get Storage Context
351
+ with (st.spinner("Processing input file, this may take some time...")):
352
+ new_document.nodes_to_summary(summarizer=ss.tree_summarizer)
353
+ new_document.summary_to_oneline(summarizer=ss.tree_summarizer)
354
+ new_document.nodes_to_document_keywords()
355
+ new_document.nodes_to_storage()
356
+ ### Get Retrieval on Vector Store Index
357
+ with (st.spinner("Building retriever for the input file...")):
358
+ new_document.storage_to_retriever(callback_manager=ss.callback_manager)
359
+ ### Get LLM Query Engine
360
+ with (st.spinner("Building query responder for the input file...")):
361
+ new_document.retriever_to_engine(
362
+ response_synthesizer=ss.response_synthesizer,
363
+ callback_manager=ss.callback_manager
364
+ )
365
+ new_document.engine_to_sub_question_engine()
366
+
367
+ ### Officially Add to Document List
368
+ ss.uploaded_files.append(file_io) # Left UI Bar
369
+ ss.doclist.append(new_document) # Document list for RAG. # TODO(Jonathan Wang): Fix potential duplication.
370
+
371
+ ### Get LLM Agent
372
+ with (st.spinner("Building LLM Agent for the input file...")):
373
+ agent = doclist_to_agent(ss.doclist)
374
+ ss.agent = agent
375
+
376
+ # All done!
377
+ st.toast("All done!")
378
+
379
+ # Display summary of new document in chat.
380
+ with messages_container:
381
+ ss.chat_messages.append(
382
+ {"role": "assistant", "content": new_document.summary_oneline}
383
+ )
384
+ with st.chat_message("assistant"):
385
+ st.markdown(new_document.summary_oneline)
386
+
387
+ ### Cleaning
388
+ empty_cache()
389
+ gc.collect()
390
+
391
+
392
+ def handle_chat_message(user_message: str) -> str:
393
+ # Get Response
394
+ if (not hasattr(ss, "doclist") or len(ss.doclist) == 0):
395
+ return "Please upload a document to get started."
396
+
397
+ if (not hasattr(ss, "agent") or ss.agent is None):
398
+ warnings.warn("No LLM Agent found. Attempting to create one.", stacklevel=2)
399
+ with st.sidebar, (st.spinner("Building LLM Agent for the input file...")):
400
+ agent = doclist_to_agent(ss.doclist)
401
+ ss.agent = agent
402
+
403
+ response = ss.agent.query(user_message)
404
+ # Get citations if available
405
+ response = ss.citation_builder.get_citations(response, citation_threshold=60)
406
+ # Add citations to response text
407
+ response_with_citations = ss.citation_builder.add_citations_to_response(response)
408
+ return str(response_with_citations.response)
409
+
410
+ @st.cache_data
411
+ def get_pdf_display(
412
+ file: Any,
413
+ app_width: str = "100%",
414
+ app_height: str = "500",
415
+ starting_page_number: int | None = None
416
+ ) -> str:
417
+ # Read file as binary
418
+ file_bytes = file.getbuffer()
419
+ base64_pdf = base64.b64encode(file_bytes).decode("utf-8")
420
+
421
+ pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}"' # TODO(Jonathan Wang): iframe vs embed
422
+ if starting_page_number is not None:
423
+ pdf_display += f"#page={starting_page_number}"
424
+ pdf_display += f' width="{app_width}" height="{app_height}" type="application/pdf"></embed>' # iframe vs embed
425
+ return (pdf_display)
426
+
427
+ # Upload
428
+ with st.sidebar:
429
+ uploaded_files = uploaded_files or [] # handle case when no file is uploaded
430
+ for uploaded_file in uploaded_files:
431
+ if (uploaded_file not in ss.uploaded_files):
432
+ handle_new_pdf(uploaded_file)
433
+
434
+ if (ss.selected_file is None and ss.uploaded_files):
435
+ ss.selected_file = ss.uploaded_files[-1]
436
+
437
+ file_names = [file.name for file in ss.uploaded_files]
438
+ selected_file_name = st.radio("Uploaded Files:", file_names)
439
+ if selected_file_name:
440
+ ss.selected_file = [file for file in ss.uploaded_files if file.name == selected_file_name][-1]
441
+
442
+ with col_left:
443
+ if (ss.selected_file is None):
444
+ selected_file_name = "Upload a file."
445
+ st.markdown(f"## {selected_file_name}")
446
+
447
+ elif (ss.selected_file is not None):
448
+ selected_file = ss.selected_file
449
+ selected_file_name = selected_file.name
450
+
451
+ if (selected_file.type == "application/pdf"):
452
+ pdf_display = get_pdf_display(selected_file, app_width="100%", app_height="550")
453
+ st.markdown(pdf_display, unsafe_allow_html=True)
454
+
455
+ # Chat
456
+ if prompt:
457
+ with messages_container:
458
+ with st.chat_message("user"):
459
+ st.markdown(prompt)
460
+ ss.chat_messages.append({"role": "user", "content": prompt})
461
+
462
+ with st.spinner("Generating response..."):
463
+ # Get Response
464
+ response = handle_chat_message(prompt)
465
+
466
+ if response:
467
+ ss.chat_messages.append(
468
+ {"role": "assistant", "content": response}
469
+ )
470
+ with st.chat_message("assistant"):
471
+ st.markdown(response)
citation.py ADDED
@@ -0,0 +1,245 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [CITATION]
3
+ #####################################################
4
+ # Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This project creates an app to chat with PDFs.
8
+
9
+ # This is the CITATION
10
+ # which adds citation information to the LLM response
11
+ #####################################################
12
+ ## TODO Board:
13
+ # Investigate using LLM model weights with attention to determine citations.
14
+
15
+ # https://gradientscience.org/contextcite/
16
+ # https://github.com/MadryLab/context-cite/blob/main/context_cite/context_citer.py#L25
17
+ # https://github.com/MadryLab/context-cite/blob/main/context_cite/context_partitioner.py
18
+ # https://github.com/MadryLab/context-cite/blob/main/context_cite/solver.py
19
+
20
+ #####################################################
21
+ ## IMPORTS
22
+ from __future__ import annotations
23
+
24
+ from collections import defaultdict
25
+ from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING
26
+ import warnings
27
+
28
+ import numpy as np
29
+ from llama_index.core.base.response.schema import RESPONSE_TYPE, Response
30
+
31
+ if TYPE_CHECKING:
32
+ from llama_index.core.schema import NodeWithScore
33
+
34
+ # Own Modules
35
+ from merger import _merge_on_scores
36
+ from rapidfuzz import fuzz, process, utils
37
+
38
+
39
+ # Lazy Loading:
40
+ # from nltk import sent_tokenize # noqa: ERA001
41
+
42
+ #####################################################
43
+ ## CODE
44
+
45
+ class CitationBuilder:
46
+ """Class that builds citations from responses."""
47
+
48
+ text_splitter: Callable[[str], list[str]]
49
+
50
+ def __init__(self, text_splitter: Callable[[str], list[str]] | None = None) -> None:
51
+ if not text_splitter:
52
+ from nltk import sent_tokenize
53
+ text_splitter = sent_tokenize
54
+ self.text_splitter = text_splitter
55
+
56
+ @classmethod
57
+ def class_name(cls) -> str:
58
+ return "CitationBuilder"
59
+
60
+ def convert_to_response(self, input_response: RESPONSE_TYPE) -> Response:
61
+ # Convert all other response types into the baseline response
62
+ # Otherwise, we won't have the full response text generated.
63
+ if not isinstance(input_response, Response):
64
+ response = input_response.get_response()
65
+ if isinstance(response, Response):
66
+ return response
67
+ else:
68
+ # TODO(Jonathan Wang): Handle async responses with Coroutines
69
+ msg = "Expected Response object, got Coroutine"
70
+ raise TypeError(msg)
71
+ else:
72
+ return input_response
73
+
74
+ def find_nearest_whitespace(
75
+ self,
76
+ input_text: str,
77
+ input_index: int,
78
+ right_to_left: bool=False
79
+ ) -> int:
80
+ """Given a string and an index, find the index of the whitespace character nearest to that index."""
81
+ if (input_index < 0 or input_index >= len(input_text)):
82
+ msg = "find_nearest_whitespace: index beyond string."
83
+ raise ValueError(msg)
84
+
85
+ find_text = ""
86
+ if (right_to_left):
87
+ find_text = input_text[:input_index]
88
+ for index, char in enumerate(reversed(find_text)):
89
+ if (char.isspace()):
90
+ return (len(find_text)-1 - index)
91
+ return (0)
92
+ else:
93
+ find_text = input_text[input_index:]
94
+ for index, char in enumerate(find_text):
95
+ if (char.isspace()):
96
+ return (input_index + index)
97
+ return (len(input_text))
98
+
99
+ def get_citations(
100
+ self,
101
+ input_response: RESPONSE_TYPE,
102
+ citation_threshold: int = 70,
103
+ citation_len: int = 128
104
+ ) -> Response:
105
+ response = self.convert_to_response(input_response)
106
+
107
+ if not response.response or not response.source_nodes:
108
+ return response
109
+
110
+ # Get current response text:
111
+ response_text = response.response
112
+ source_nodes = response.source_nodes
113
+
114
+ # 0. Get candidate nodes for citation.
115
+ # Fuzzy match each source node text against the response text.
116
+ source_texts: dict[str, list[NodeWithScore]] = defaultdict(list)
117
+ for node in source_nodes:
118
+ if (
119
+ (len(getattr(node.node, "text", "")) > 0) and
120
+ (len(node.node.metadata) > 0)
121
+ ): # filter out non-text nodes and intermediate nodes from SubQueryQuestionEngine
122
+ source_texts[node.node.text].append(node) # type: ignore
123
+
124
+ fuzzy_matches = process.extract(
125
+ response_text,
126
+ list(source_texts.keys()),
127
+ scorer=fuzz.partial_ratio,
128
+ processor=utils.default_process,
129
+ score_cutoff=max(10, citation_threshold - 10)
130
+ )
131
+
132
+ # Convert extracted matches of form (Match, Score, Rank) into scores for all source_texts.
133
+ if fuzzy_matches:
134
+ fuzzy_texts, _, _ = zip(*fuzzy_matches)
135
+ fuzzy_nodes = [source_texts[text][0] for text in fuzzy_texts]
136
+ else:
137
+ return response
138
+
139
+ # 1. Combine fuzzy score and source text semantic/reranker score.
140
+ # NOTE: for our merge here, we value the nodes with strong fuzzy text matching over other node types.
141
+ cited_nodes = _merge_on_scores(
142
+ a_list=fuzzy_nodes,
143
+ b_list=source_nodes, # same nodes, different scores (fuzzy vs semantic/bm25/reranker)
144
+ a_scores_input=[getattr(node, "score", np.nan) for node in fuzzy_nodes],
145
+ b_scores_input=[getattr(node, "score", np.nan) for node in source_nodes],
146
+ a_weight=0.85, # we want to heavily prioritize the fuzzy text for matches
147
+ top_k=3 # maximum of three source options.
148
+ )
149
+
150
+ # 2. Add cited nodes text to the response text, and cited nodes as metadata.
151
+ # For each sentence in the response, if there is a match in the source text, add a citation tag.
152
+ response_sentences = self.text_splitter(response_text)
153
+ output_text = ""
154
+ output_citations = ""
155
+ citation_tag = 0
156
+
157
+ for response_sentence in response_sentences:
158
+ # Get fuzzy citation at sentence level
159
+ best_alignment = None
160
+ best_score = 0
161
+ best_node = None
162
+
163
+ for _, source_node in enumerate(source_nodes):
164
+ source_node_text = getattr(source_node.node, "text", "")
165
+ new_alignment = fuzz.partial_ratio_alignment(
166
+ response_sentence,
167
+ source_node_text,
168
+ processor=utils.default_process, score_cutoff=citation_threshold
169
+ )
170
+ new_score = 0.0
171
+
172
+ if (new_alignment is not None and (new_alignment.src_end - new_alignment.src_start) > 0):
173
+ new_score = fuzz.ratio(
174
+ source_node_text[new_alignment.src_start:new_alignment.src_end],
175
+ response_sentence[new_alignment.dest_start:new_alignment.dest_end],
176
+ processor=utils.default_process
177
+ )
178
+ new_score = new_score * (new_alignment.src_end - new_alignment.src_start) / float(len(response_sentence))
179
+
180
+ if (new_score > best_score):
181
+ best_alignment = new_alignment
182
+ best_score = new_score
183
+ best_node = source_node
184
+
185
+ if (best_score <= 0 or best_node is None or best_alignment is None):
186
+ # No match
187
+ output_text += response_sentence
188
+ continue
189
+
190
+ # Add citation tag to text
191
+ citation_tag_position = self.find_nearest_whitespace(response_sentence, best_alignment.dest_start, right_to_left=True)
192
+ output_text += response_sentence[:citation_tag_position] # response up to the quote
193
+ output_text += f" [{citation_tag}] " # add citation tag
194
+ output_text += response_sentence[citation_tag_position:] # response after the quote
195
+
196
+ # Add citation text to citations
197
+ citation = getattr(best_node.node, "text", "")
198
+ citation_margin = round((citation_len - (best_alignment.src_end - best_alignment.src_start)) / 2)
199
+ nearest_whitespace_pre = self.find_nearest_whitespace(citation, max(0, best_alignment.src_start), right_to_left=True)
200
+ nearest_whitespace_post = self.find_nearest_whitespace(citation, min(len(citation)-1, best_alignment.src_end), right_to_left=False)
201
+ nearest_whitespace_prewindow = self.find_nearest_whitespace(citation, max(0, nearest_whitespace_pre - citation_margin), right_to_left=True)
202
+ nearest_whitespace_postwindow = self.find_nearest_whitespace(citation, min(len(citation)-1, nearest_whitespace_post + citation_margin), right_to_left=False)
203
+
204
+ citation_text = (
205
+ citation[nearest_whitespace_prewindow+1: nearest_whitespace_pre+1]
206
+ + "|||||"
207
+ + citation[nearest_whitespace_pre+1:nearest_whitespace_post]
208
+ + "|||||"
209
+ + citation[nearest_whitespace_post:nearest_whitespace_postwindow]
210
+ + f"… <<{best_node.node.metadata.get('name', '')}, Page(s) {best_node.node.metadata.get('page_number', '')}>>"
211
+ )
212
+ output_citations += f"[{citation_tag}]: {citation_text}\n\n"
213
+ citation_tag += 1
214
+
215
+ # Create output
216
+ if response.metadata is not None:
217
+ # NOTE: metadata certainly exists by now, but the schema allows None...
218
+ response.metadata["cited_nodes"] = cited_nodes
219
+ response.metadata["citations"] = output_citations
220
+ response.response = output_text # update response to include citation tags
221
+ return response
222
+
223
+ def add_citations_to_response(self, input_response: Response) -> Response:
224
+ if not hasattr(input_response, "metadata"):
225
+ msg = "Input response does not have metadata."
226
+ raise ValueError(msg)
227
+ elif input_response.metadata is None or "citations" not in input_response.metadata:
228
+ warnings.warn("Input response does not have citations.", stacklevel=2)
229
+ input_response = self.get_citations(input_response)
230
+
231
+ # Add citation text to response
232
+ if (hasattr(input_response, "metadata") and input_response.metadata.get("citations", "") != ""):
233
+ input_response.response = (
234
+ input_response.response
235
+ + "\n\n----- CITATIONS -----\n\n"
236
+ + input_response.metadata.get('citations', "")
237
+ ) # type: ignore
238
+ return input_response
239
+
240
+ def __call__(self, input_response: RESPONSE_TYPE, *args: Any, **kwds: Any) -> Response:
241
+ return self.get_citations(input_response, *args, **kwds)
242
+
243
+
244
+ def get_citation_builder() -> CitationBuilder:
245
+ return CitationBuilder()
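+
+
+ # Illustrative usage sketch (names like `engine` are assumed placeholders; the real wiring
+ # lives in app.py's handle_chat_message):
+ # builder = get_citation_builder()
+ # response = engine.query("What does the document conclude?")
+ # response = builder.get_citations(response, citation_threshold=60)
+ # print(builder.add_citations_to_response(response).response)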
engine.py ADDED
@@ -0,0 +1,126 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [ENGINE]
3
+ #####################################################
4
+ # Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This project creates an app to chat with PDFs.
8
+
9
+ # This is the ENGINE
10
+ # which defines how LLMs handle processing.
11
+ #####################################################
12
+ ## TODO Board:
13
+
14
+ #####################################################
15
+ ## IMPORTS
16
+ from __future__ import annotations
17
+
18
+ import gc
19
+ from typing import TYPE_CHECKING, Callable, List, Optional, cast
20
+
21
+ from llama_index.core.query_engine import CustomQueryEngine
22
+ from llama_index.core.schema import NodeWithScore, QueryBundle
23
+ from llama_index.core.settings import (
24
+ Settings,
25
+ )
26
+ from torch.cuda import empty_cache
27
+
28
+ if TYPE_CHECKING:
29
+ from llama_index.core.base.response.schema import Response
30
+ from llama_index.core.callbacks import CallbackManager
31
+ from llama_index.core.postprocessor.types import BaseNodePostprocessor
32
+ from llama_index.core.response_synthesizers import (
33
+ BaseSynthesizer,
34
+ )
35
+ from llama_index.core.retrievers import BaseRetriever
36
+
37
+ # Own Modules
38
+
39
+ #####################################################
40
+ ## CODE
41
+ class RAGQueryEngine(CustomQueryEngine):
42
+ """Custom RAG Query Engine."""
43
+
44
+ retriever: BaseRetriever
45
+ response_synthesizer: BaseSynthesizer
46
+ node_postprocessors: Optional[List[BaseNodePostprocessor]] = []
47
+
48
+ # def __init__(
49
+ # self,
50
+ # retriever: BaseRetriever,
51
+ # response_synthesizer: Optional[BaseSynthesizer] = None,
52
+ # node_postprocessors: Optional[List[BaseNodePostprocessor]] = None,
53
+ # callback_manager: Optional[CallbackManager] = None,
54
+ # ) -> None:
55
+ # self._retriever = retriever
56
+ # # callback_manager = (
57
+ # # callback_manager
58
+ # # Settings.callback_manager
59
+ # # )
60
+ # # llm = llm or Settings.llm
61
+
62
+ # self._response_synthesizer = response_synthesizer or get_response_synthesizer(
63
+ # # llm=llm,
64
+ # # service_context=service_context,
65
+ # # callback_manager=callback_manager,
66
+ # )
67
+ # self._node_postprocessors = node_postprocessors or []
68
+ # self._metadata_mode = metadata_mode
69
+
70
+ # for node_postprocessor in self._node_postprocessors:
71
+ # node_postprocessor.callback_manager = callback_manager
72
+
73
+ # super().__init__(callback_manager=callback_manager)
74
+
75
+ @classmethod
76
+ def class_name(cls) -> str:
77
+ """Class name."""
78
+ return "RAGQueryEngine"
79
+
80
+ # taken from Llamaindex CustomEngine:
81
+ # https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/query_engine/retriever_query_engine.py#L134
82
+ def _apply_node_postprocessors(
83
+ self, nodes: list[NodeWithScore], query_bundle: QueryBundle
84
+ ) -> list[NodeWithScore]:
85
+ if self.node_postprocessors is None:
86
+ return nodes
87
+
88
+ for node_postprocessor in self.node_postprocessors:
89
+ nodes = node_postprocessor.postprocess_nodes(
90
+ nodes, query_bundle=query_bundle
91
+ )
92
+ return nodes
93
+
94
+ def retrieve(self, query_bundle: QueryBundle) -> list[NodeWithScore]:
95
+ nodes = self.retriever.retrieve(query_bundle)
96
+ return self._apply_node_postprocessors(nodes, query_bundle=query_bundle)
97
+
98
+ async def aretrieve(self, query_bundle: QueryBundle) -> list[NodeWithScore]:
99
+ nodes = await self.retriever.aretrieve(query_bundle)
100
+ return self._apply_node_postprocessors(nodes, query_bundle=query_bundle)
101
+
102
+ def custom_query(self, query_str: str) -> Response:
103
+ # Convert query string into query bundle
104
+ query_bundle = QueryBundle(query_str=query_str)
105
+ nodes = self.retrieve(query_bundle) # also does the postprocessing.
106
+
107
+ response_obj = self.response_synthesizer.synthesize(query_bundle, nodes)
108
+
109
+ empty_cache()
110
+ gc.collect()
111
+ return cast(Response, response_obj) # type: ignore
112
+
113
+
114
+ # @st.cache_resource # none of these can be hashable or cached :(
115
+ def get_engine(
116
+ retriever: BaseRetriever,
117
+ response_synthesizer: BaseSynthesizer,
118
+ node_postprocessors: list[BaseNodePostprocessor] | None = None,
119
+ callback_manager: CallbackManager | None = None,
120
+ ) -> RAGQueryEngine:
121
+ return RAGQueryEngine(
122
+ retriever=retriever,
123
+ response_synthesizer=response_synthesizer,
124
+ node_postprocessors=node_postprocessors,
125
+ callback_manager=callback_manager or Settings.callback_manager,
126
+ )
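+
+
+ # Illustrative usage sketch (`my_retriever` and `my_synthesizer` are assumed placeholders;
+ # see full_doc.py's retriever_to_engine for the real call site):
+ # engine = get_engine(retriever=my_retriever, response_synthesizer=my_synthesizer)
+ # answer = engine.query("Summarize the key findings.")  # retrieve -> postprocess -> synthesize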
full_doc.py ADDED
@@ -0,0 +1,336 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [FULLDOC]
3
+ #####################################################
4
+ ### Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This creates an app to chat with PDFs.
8
+
9
+ # This is the FULLDOC
10
+ # which is a class that associates documents
11
+ # with their critical information
12
+ # and their tools. (keywords, summary, queryengine, etc.)
13
+ #####################################################
14
+ ### TODO Board:
15
+ # Automatically determine which reader to use for each document based on the file type.
16
+
17
+ #####################################################
18
+ ### PROGRAM SETTINGS
19
+
20
+ #####################################################
21
+ ### PROGRAM IMPORTS
22
+ from __future__ import annotations
23
+
24
+ import asyncio
25
+ from pathlib import Path
26
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, TypeVar
27
+ from uuid import UUID, uuid4
28
+
29
+ from llama_index.core import StorageContext, VectorStoreIndex
30
+ from llama_index.core.query_engine import SubQuestionQueryEngine
31
+ from llama_index.core.schema import BaseNode, TransformComponent
32
+ from llama_index.core.settings import Settings
33
+ from llama_index.core.tools import QueryEngineTool, ToolMetadata
34
+ from streamlit import session_state as ss
35
+
36
+ if TYPE_CHECKING:
37
+ from llama_index.core.base.base_query_engine import BaseQueryEngine
38
+ from llama_index.core.callbacks import CallbackManager
39
+ from llama_index.core.node_parser import NodeParser
40
+ from llama_index.core.readers.base import BaseReader
41
+ from llama_index.core.response_synthesizers import BaseSynthesizer
42
+ from llama_index.core.retrievers import BaseRetriever
43
+
44
+ # Own Modules
45
+ from engine import get_engine
46
+ from keywords import KeywordMetadataAdder
47
+ from retriever import get_retriever
48
+ from storage import get_docstore, get_vector_store
49
+ from summary import DEFAULT_ONELINE_SUMMARY_TEMPLATE, DEFAULT_TREE_SUMMARY_TEMPLATE
50
+
51
+ #####################################################
52
+ ### SCRIPT
53
+
54
+ GenericNode = TypeVar("GenericNode", bound=BaseNode)
55
+
56
+ class FullDocument:
57
+ """Bundles all the information about a document together.
58
+
59
+ Args:
60
+ name (str): The name of the document.
61
+ file_path (Path): The path to the document.
62
+ summary (str): The summary of the document.
63
+ keywords (List[str]): The keywords of the document.
64
+ entities (List[str]): The entities of the document.
65
+ vector_store (BaseDocumentStore): The vector store of the document.
66
+ """
67
+
68
+ # Identifiers
69
+ id: UUID
70
+ name: str
71
+ file_path: Path
72
+ file_name: str
73
+
74
+ # Basic Contents
75
+ summary: str
76
+ summary_oneline: str # A one line summary of the document.
77
+ keywords: set[str] # List of keywords in document.
78
+ # entities: Set[str] # list of entities in document ## TODO: Add entities
79
+ metadata: dict[str, Any] | None
80
+ # NOTE: other metadata that might be useful:
81
+ # Document Creation / Last Date (e.g., recency important for legal/medical questions)
82
+ # Document Source and Trustworthiness
83
+ # Document Access Level (though this isn't important for us here.)
84
+ # Document Citations?
85
+ # Document Format? (text/spreadsheet/presentation/image/etc.)
86
+
87
+ # RAG Components
88
+ nodes: list[BaseNode]
89
+ storage_context: StorageContext # NOTE: current setup has single storage context per document.
90
+ vector_store_index: VectorStoreIndex
91
+ retriever: BaseRetriever # TODO(Jonathan Wang): Consider multiple retrievers for keywords vs semantic.
92
+ engine: BaseQueryEngine # TODO(Jonathan Wang): Consider multiple engines.
93
+ subquestion_engine: SubQuestionQueryEngine
94
+
95
+ def __init__(
96
+ self,
97
+ name: str,
98
+ file_path: Path | str,
99
+ metadata: dict[str, Any] | None = None
100
+ ) -> None:
101
+ self.id = uuid4()
102
+ self.name = name
103
+
104
+ if (isinstance(file_path, str)):
105
+ file_path = Path(file_path)
106
+ self.file_path = file_path
107
+ self.file_name = file_path.name
108
+
109
+ self.metadata = metadata
110
+
111
+
112
+ @classmethod
113
+ def class_name(cls) -> str:
114
+ return "FullDocument"
115
+
116
+ def add_name_to_nodes(self, nodes: list[GenericNode]) -> list[GenericNode]:
117
+ """Add the name of the document to the nodes.
118
+
119
+ Args:
120
+ nodes (List[GenericNode]): The nodes to add the name to.
121
+
122
+ Returns:
123
+ List[GenericNode]: The nodes with the name added.
124
+ """
125
+ for node in nodes:
126
+ node.metadata["name"] = self.name
127
+ return nodes
128
+
129
+ def file_to_nodes(
130
+ self,
131
+ reader: BaseReader,
132
+ postreaders: list[Callable[[list[GenericNode]], list[GenericNode]] | TransformComponent] | None=None, # NOTE: these should be used in order. and probably all TransformComponent instead.
133
+ node_parser: NodeParser | None=None,
134
+ postparsers: list[Callable[[list[GenericNode]], list[GenericNode]] | TransformComponent] | None=None, # Stuff like chunking, adding Embeddings, etc.
135
+ ) -> None:
136
+ """Read in the file path and get the nodes.
137
+
138
+ Args:
139
+ file_path (Optional[Path], optional): The path to the file. Defaults to file_path from init.
140
+ reader (Optional[BaseReader], optional): The reader to use. Defaults to reader from init.
141
+ """
142
+ # Use the provided reader to read in the file.
143
+ print("NEWPDF: Reading input file...")
144
+ nodes = reader.load_data(file_path=self.file_path)
145
+
146
+ # Use node postreaders to post process the nodes.
147
+ if (postreaders is not None):
148
+ for node_postreader in postreaders:
149
+ nodes = node_postreader(nodes) # type: ignore (TransformComponent allows a list of nodes)
150
+
151
+ # Use node parser to parse the nodes.
152
+ if (node_parser is None):
153
+ node_parser = Settings.node_parser
154
+ nodes = node_parser(nodes) # type: ignore (Document is a child of BaseNode)
155
+
156
+ # Use node postreaders to post process the nodes. (also add the common name to the nodes)
157
+ if (postparsers is None):
158
+ postparsers = [self.add_name_to_nodes]
159
+ else:
160
+ postparsers.append(self.add_name_to_nodes)
161
+
162
+ for node_postparser in postparsers:
163
+ nodes = node_postparser(nodes) # type: ignore (TransformComponent allows a list of nodes)
164
+
165
+ # Save nodes
166
+ self.nodes = nodes # type: ignore
167
+
168
+ def nodes_to_summary(
169
+ self,
170
+ summarizer: BaseSynthesizer, # NOTE: this is typically going to be a TreeSummarizer / SimpleSummarize for our use case
171
+ query_str: str = DEFAULT_TREE_SUMMARY_TEMPLATE,
172
+ ) -> None:
173
+ """Summarize the nodes.
174
+
175
+ Args:
176
+ summarizer (BaseSynthesizer): The summarizer to use. Takes in nodes and returns summary.
177
+ """
178
+ if (not hasattr(self, "nodes")):
179
+ msg = "Nodes must be extracted from document using `file_to_nodes` before calling `nodes_to_summary`."
180
+ raise ValueError(msg)
181
+
182
+ text_chunks = [getattr(node, "text", "") for node in self.nodes if hasattr(node, "text")]
183
+ summary_responses = summarizer.aget_response(query_str=query_str, text_chunks=text_chunks)
184
+
185
+ loop = asyncio.get_event_loop()
186
+ summary = loop.run_until_complete(summary_responses)
187
+
188
+ if (not isinstance(summary, str)):
189
+ # TODO(Jonathan Wang): ... this should always give us a string, right? we're not doing anything fancy with TokenGen/TokenAsyncGen/Pydantic BaseModel...
190
+ msg = f"Summarizer must return a string summary. Actual type: {type(summary)}, with value {summary}."
191
+ raise TypeError(msg)
192
+
193
+ self.summary = summary
194
+
195
+ def summary_to_oneline(
196
+ self,
197
+ summarizer: BaseSynthesizer, # NOTE: this is typically going to be a SimpleSummarize / TreeSummarizer for our use case
198
+ query_str: str = DEFAULT_ONELINE_SUMMARY_TEMPLATE,
199
+ ) -> None:
200
+
201
+ if (not hasattr(self, "summary")):
202
+ msg = "Summary must be extracted from document using `nodes_to_summary` before calling `summary_to_oneline`."
203
+ raise ValueError(msg)
204
+
205
+ oneline = summarizer.get_response(query_str=query_str, text_chunks=[self.summary]) # There's only one chunk.
206
+ self.summary_oneline = oneline # type: ignore | shouldn't have fancy TokenGenerators / TokenAsyncGenerators / Pydantic BaseModels
207
+
208
+ def nodes_to_document_keywords(self, keyword_extractor: Optional[KeywordMetadataAdder] = None) -> None:
209
+ """Save the keywords from the nodes into the document.
210
+
211
+ Args:
212
+ keyword_extractor (Optional[BaseKeywordExtractor], optional): The keyword extractor to use. Defaults to None.
213
+ """
214
+ if (not hasattr(self, "nodes")):
215
+ msg = "Nodes must be extracted from document using `file_to_nodes` before calling `nodes_to_keywords`."
216
+ raise ValueError(msg)
217
+
218
+ if (keyword_extractor is None):
219
+ keyword_extractor = KeywordMetadataAdder()
220
+
221
+ # Add keywords to nodes using KeywordMetadataAdder
222
+ keyword_extractor.process_nodes(self.nodes)
223
+
224
+ # Save keywords
225
+ keywords: list[str] = []
226
+ for node in self.nodes:
227
+ node_keywords = node.metadata.get("keyword_metadata", "").split(", ") # NOTE: KeywordMetadataAdder concatenates b/c required string output
228
+ keywords = keywords + node_keywords
229
+
230
+ # TODO(Jonathan Wang): handle deduplicating keywords which are similar to each other (fuzzy?)
231
+ self.keywords = set(keywords)
232
+
233
+ def nodes_to_storage(self, create_new_storage: bool = True) -> None:
234
+ """Save the nodes to storage."""
235
+ if (not hasattr(self, "nodes")):
236
+ msg = "Nodes must be extracted from document using `file_to_nodes` before calling `nodes_to_storage`."
237
+ raise ValueError(msg)
238
+
239
+ if (create_new_storage):
240
+ docstore = get_docstore(documents=self.nodes)
241
+ self.docstore = docstore
242
+
243
+ vector_store = get_vector_store()
244
+
245
+ storage_context = StorageContext.from_defaults(
246
+ docstore=docstore,
247
+ vector_store=vector_store
248
+ )
249
+ self.storage_context = storage_context
250
+
251
+ vector_store_index = VectorStoreIndex(
252
+ self.nodes, storage_context=storage_context
253
+ )
254
+ self.vector_store_index = vector_store_index
255
+
256
+ else:
257
+ ### TODO(Jonathan Wang): use an existing storage instead of creating a new one.
258
+ msg = "Currently creates new storage for every document."
259
+ raise NotImplementedError(msg)
260
+
261
+ # TODO(Jonathan Wang): Create multiple different retrievers based on the question type(?)
262
+ # E.g., if the question is focused on specific keywords or phrases, use a retriever oriented towards sparse scores.
263
+ def storage_to_retriever(
264
+ self,
265
+ semantic_nodes: int = 6,
266
+ sparse_nodes: int = 3,
267
+ fusion_nodes: int = 3,
268
+ semantic_weight: float = 0.6,
269
+ merge_up_thresh: float = 0.5,
270
+ callback_manager: CallbackManager | None=None
271
+ ) -> None:
272
+ """Create retriever from storage."""
273
+ if (not hasattr(self, "vector_store_index")):
274
+ msg = "Vector store must be extracted from document using `nodes_to_storage` before calling `storage_to_retriever`."
275
+ raise ValueError(msg)
276
+
277
+ retriever = get_retriever(
278
+ _vector_store_index=self.vector_store_index,
279
+ semantic_top_k=semantic_nodes,
280
+ sparse_top_k=sparse_nodes,
281
+ fusion_similarity_top_k=fusion_nodes,
282
+ semantic_weight_fraction=semantic_weight,
283
+ merge_up_thresh=merge_up_thresh,
284
+ verbose=True,
285
+ _callback_manager=callback_manager or ss.callback_manager
286
+ )
287
+ self.retriever = retriever
288
+
289
+ def retriever_to_engine(
290
+ self,
291
+ response_synthesizer: BaseSynthesizer,
292
+ callback_manager: CallbackManager | None=None
293
+ ) -> None:
294
+ """Create query engine from retriever."""
295
+ if (not hasattr(self, "retriever")):
296
+ msg = "Retriever must be extracted from document using `storage_to_retriever` before calling `retriever_to_engine`."
297
+ raise ValueError(msg)
298
+
299
+ engine = get_engine(
300
+ retriever=self.retriever,
301
+ response_synthesizer=response_synthesizer,
302
+ callback_manager=callback_manager or ss.callback_manager
303
+ )
304
+ self.engine = engine
305
+
306
+ # TODO(Jonathan Wang): Create Summarization Index and Engine.
307
+ def engine_to_sub_question_engine(self) -> None:
308
+ """Convert a basic query engine into a sub-question query engine for handling complex, multi-step questions.
309
+
310
+ Args:
311
+ query_engine (BaseQueryEngine): The Base Query Engine to convert.
312
+ """
313
+ if (not hasattr(self, "summary_oneline")):
314
+ msg = "One Line Summary must be created for the document before calling `engine_to_sub_query_engine`"
315
+ raise ValueError(msg)
316
+ elif (not hasattr(self, "engine")):
317
+ msg = "Basic Query Engine must be created before calling `engine_to_sub_query_engine`"
318
+ raise ValueError(msg)
319
+
320
+ sqe_tools = [
321
+ QueryEngineTool(
322
+ query_engine=self.engine, # TODO(Jonathan Wang): handle multiple engines?
323
+ metadata=ToolMetadata(
324
+ name=(self.name + " simple query answerer"),
325
+ description=f"""A tool that answers simple questions about the following document: {self.summary_oneline}"""
326
+ )
327
+ )
328
+ # TODO(Jonathan Wang): add more tools
329
+ ]
330
+
331
+ subquestion_engine = SubQuestionQueryEngine.from_defaults(
332
+ query_engine_tools=sqe_tools,
333
+ verbose=True,
334
+ use_async=True
335
+ )
336
+ self.subquestion_engine = subquestion_engine
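+
+
+ # Illustrative usage sketch of the intended call order (mirrors handle_new_pdf in app.py;
+ # the reader/parser/summarizer objects are assumed to come from the other modules):
+ # doc = FullDocument(name="input.pdf", file_path="data/input.pdf")
+ # doc.file_to_nodes(reader=pdf_reader, node_parser=node_parser)
+ # doc.nodes_to_summary(summarizer=tree_summarizer)
+ # doc.summary_to_oneline(summarizer=tree_summarizer)
+ # doc.nodes_to_document_keywords()
+ # doc.nodes_to_storage()
+ # doc.storage_to_retriever()
+ # doc.retriever_to_engine(response_synthesizer=response_synthesizer)
+ # doc.engine_to_sub_question_engine()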
keywords.py ADDED
@@ -0,0 +1,110 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [Keywords]
3
+ #####################################################
4
+ ### Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This creates an app to chat with PDFs.
8
+
9
+ # This is the Keywords
10
+ # Which creates keywords based on documents.
11
+ #####################################################
12
+ ### TODO Board:
13
+ # TODO(Jonathan Wang): Add Maximum marginal relevance to the merger for better keywords.
14
+ # TODO(Jonathan Wang): create own version of Rake keywords
15
+
16
+ #####################################################
17
+ ### PROGRAM SETTINGS
18
+
19
+
20
+ #####################################################
21
+ ### PROGRAM IMPORTS
22
+ from __future__ import annotations
23
+
24
+ from typing import Any, Callable, Optional
25
+
26
+ # Keywords
27
+ # from multi_rake import Rake # removing because of compile issues and lack of maintenance
28
+ import yake
29
+ from llama_index.core.bridge.pydantic import Field
30
+ from llama_index.core.schema import BaseNode
31
+
32
+ # Own Modules
33
+ from metadata_adder import MetadataAdder
34
+
35
+ #####################################################
36
+ ### SCRIPT
37
+
38
+ def get_keywords(input_text: str, top_k: int = 5) -> str:
39
+ """
40
+ Given a string, get its keywords using YAKE (the RAKE+YAKE distribution-based fusion is currently disabled).
41
+
42
+ Inputs:
43
+ input_text (str): the input text to get keywords from
44
+ top_k (int): the maximum number of keywords to return. Defaults to 5.
45
+
46
+ Returns:
47
+ str: A list of the keywords, joined into a string.
48
+ """
49
+ # RAKE
50
+ # kw_extractor = Rake()
51
+ # keywords_rake = kw_extractor.apply(input_text)
52
+ # keywords_rake = dict(keywords_rake)
53
+ # YAKE
54
+ kw_extractor = yake.KeywordExtractor(lan="en", dedupLim=0.9, n=3)
55
+ keywords_yake = kw_extractor.extract_keywords(input_text)
56
+ # reorder scores so that higher is better
57
+ keywords_yake = {keyword[0].lower(): (1 - keyword[1]) for keyword in keywords_yake}
58
+ keywords_yake = dict(
59
+ sorted(keywords_yake.items(), key=lambda x: x[1], reverse=True) # type hinting YAKE is miserable
60
+ )
61
+
62
+ # Merge RAKE and YAKE based on scores.
63
+ # keywords_merged = _merge_on_scores(
64
+ # list(keywords_yake.keys()),
65
+ # list(keywords_rake.keys()),
66
+ # list(keywords_yake.values()),
67
+ # list(keywords_rake.values()),
68
+ # a_weight=0.5,
69
+ # top_k=top_k
70
+ # )
71
+
72
+ # return (list(keywords_rake.keys())[:top_k], list(keywords_yake.keys())[:top_k], keywords_merged)
73
+ return ", ".join(list(keywords_yake)[:top_k]) # kinda regretting forcing this into a string
74
+
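+ # Illustrative usage sketch (assumes the top_k parameter above):
+ # get_keywords("Transformers use self-attention to weigh token interactions.", top_k=3)
+ # returns a comma-separated string of up to three YAKE keywords.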
75
+
76
+ class KeywordMetadataAdder(MetadataAdder):
77
+ """Adds keyword metadata to a document.
78
+
79
+ Args:
80
+ metadata_name: The name of the metadata to add to the document. Defaults to 'keyword_metadata'.
81
+ keywords_function: A function for keywords, given a source string and the number of keywords to get.
82
+ """
83
+
84
+ keywords_function: Callable[[str, int], str] = Field(
85
+ description="The function to use to extract keywords from the text. Input is string and number of keywords to extract. Output is string of keywords.",
86
+ default=get_keywords,
87
+ )
88
+ num_keywords: int = Field(
89
+ default=5,
90
+ description="The number of keywords to extract from the text. Defaults to 5.",
91
+ )
92
+
93
+ def __init__(
94
+ self,
95
+ metadata_name: str = "keyword_metadata",
96
+ keywords_function: Callable[[str, int], str] = get_keywords,
97
+ num_keywords: int = 5,
98
+ **kwargs: Any,
99
+ ) -> None:
100
+ """Init params."""
101
+ super().__init__(metadata_name=metadata_name, keywords_function=keywords_function, num_keywords=num_keywords, **kwargs) # ah yes i love oop :)
102
+
103
+ @classmethod
104
+ def class_name(cls) -> str:
105
+ return "KeywordMetadataAdder"
106
+
107
+ def get_node_metadata(self, node: BaseNode) -> str | None:
108
+ if not hasattr(node, "text") or node.text is None:
109
+ return None
110
+ return self.keywords_function(node.get_content(), self.num_keywords)
merger.py ADDED
@@ -0,0 +1,174 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [MERGER]
3
+ #####################################################
4
+ # Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This project creates an app to chat with PDFs.
8
+
9
+ # This is the MERGER
10
+ # which defines how two lists with scores
11
+ # should be merged together into one list.
12
+ # (Useful for fusing things like keywords or textnodes)
13
+ #####################################################
14
+ ## TODOS:
15
+ # We're looping through A/B more than necessary.
16
+
17
+ #####################################################
18
+ ## IMPORTS:
19
+ from __future__ import annotations
20
+
21
+ from typing import TYPE_CHECKING, Sequence, TypeVar, Union
22
+
23
+ import numpy as np
24
+
25
+ if TYPE_CHECKING:
26
+ from numpy.typing import NDArray
27
+
28
+ #####################################################
29
+ ## CODE:
30
+
31
+ GenericType = TypeVar("GenericType")
32
+
33
+ ### TODO(Jonathan Wang): Implement Maximum Marginal Relevance (MMR)
34
+ # https://en.wikipedia.org/wiki/Maximum_marginal_relevance
35
+ # def mmr(documents, query, scores, lambda_param=0.5):
36
+ # """
37
+ # Calculate Maximum Marginal Relevance (MMR) for a list of documents.
38
+
39
+ # Parameters:
40
+ # documents (list of np.array): List of document vectors.
41
+ # query (np.array): Query vector.
42
+ # scores (list of float): Relevance scores for each document.
43
+ # lambda_param (float): Trade-off parameter between relevance and diversity.
44
+
45
+ # Returns:
46
+ # list of int: Indices of selected documents in order of selection.
47
+ # """
48
+ # selected = []
49
+ # remaining = list(range(len(documents)))
50
+
51
+ # while remaining:
52
+ # if not selected:
53
+ # # Select the document with the highest relevance score
54
+ # idx = np.argmax(scores)
55
+ # else:
56
+ # # Calculate MMR for remaining documents
57
+ # mmr_scores = []
58
+ # for i in remaining:
59
+ # relevance = scores[i]
60
+ # diversity = max([np.dot(documents[i], documents[j]) for j in selected])
61
+ # mmr_score = lambda_param * relevance - (1 - lambda_param) * diversity
62
+ # mmr_scores.append(mmr_score)
63
+ # idx = remaining[np.argmax(mmr_scores)]
64
+
65
+ # selected.append(idx)
66
+ # remaining.remove(idx)
67
+
68
+ # return selected
69
+
70
+ def _merge_on_scores(
71
+ a_list: Sequence[GenericType],
72
+ b_list: Sequence[GenericType],
73
+ a_scores_input: Sequence[float | np.float64 | None],
74
+ b_scores_input: Sequence[float | np.float64 | None],
75
+ use_distribution: bool = True,
76
+ a_weight: float = 0.5,
77
+ top_k: int = 5,
78
+ ) -> Sequence[GenericType]:
79
+ """
80
+ Given two lists of elements with scores, fuse them together using "Distribution-Based Score Fusion".
81
+
82
+ Elements which have high scores in both lists are given even higher ranking here.
83
+
84
+ Inputs:
85
+ a_list: list of elements for A
86
+ a_scores: list of scores for each element in A. Assume higher is better. Share the same index.
87
+ b_list: list of elements for B
88
+ b_scores: list of scores for each element in B. Assume higher is better. Share the same index.
89
+ use_distribution: Whether to fuse using Min-Max Scaling (FALSE) or Distribution Based Score Fusion (TRUE)
90
+
91
+ Outputs:
92
+ List: List of elements that passed the merge.
93
+ """
94
+ # Guard Clauses
95
+ if ((len(a_list) != len(a_scores_input)) or (len(b_list) != len(b_scores_input))):
96
+ msg = (
97
+ f"""_merge_on_scores: Differing number of elements and scores!
98
+ a_list: {a_list}
99
+ a_scores: {a_scores_input}
100
+ b_list: {b_list}
101
+ b_scores: {b_scores_input}
102
+ """
103
+ )
104
+ raise ValueError(msg)
105
+
106
+ if (a_weight > 1 or a_weight < 0):
107
+ msg = "_merge_on_scores: weight for the A list should be between 0 and 1."
108
+ raise ValueError(msg)
109
+ if (top_k < 0): # or top_k > :
110
+ # TODO(Jonathan Wang): Find a nice way to get the number of unique elements in a list
111
+ # where those elements are potentially unhashable AND unorderable.
112
+ # I know about the n^2 solution with two lists and (if not in x), but it's a bit annoying.
113
+ msg = "_merge_on_scores: top_k must be between 0 and the total number of elements."
114
+ raise ValueError(msg)
115
+
116
+ # 0. Convert to numpy arrays
117
+ # NOTE: When using a SubQuestionQueryEngine, the subanswers are saved as NodesWithScores, but their score is None.
118
+ # We want to filter these out, so we get citations when the two texts are very similar.
119
+ a_scores: NDArray[np.float64] = np.array(a_scores_input, dtype=np.float64)
120
+ b_scores: NDArray[np.float64] = np.array(b_scores_input, dtype=np.float64)
121
+
122
+ # 1. Calculate mean of scores.
123
+ a_mean = np.nanmean(a_scores) # np.nan if empty
124
+ b_mean = np.nanmean(b_scores)
125
+
126
+ # 2. Calculate standard deviations
127
+ a_stdev = np.nanstd(a_scores)
128
+ b_stdev = np.nanstd(b_scores)
129
+
130
+ # 3. Get minimum and maximum bands as 3std from mean
131
+ # alternatively, use actual min-max scaling
132
+ a_min = a_mean - 3 * a_stdev if use_distribution else np.nanmin(a_scores)
133
+ a_max = a_mean + 3 * a_stdev if use_distribution else np.nanmax(a_scores)
134
+ b_min = b_mean - 3 * b_stdev if use_distribution else np.nanmin(b_scores)
135
+ b_max = b_mean + 3 * b_stdev if use_distribution else np.nanmax(b_scores)
136
+
137
+ # 4. Rescale the distributions
138
+ if (a_max > a_min):
139
+ a_scores = np.array([
140
+ ((x - a_min) / (a_max - a_min))
141
+ for x in a_scores
142
+ ], dtype=np.float64)
143
+ if (b_max > b_min):
144
+ b_scores = np.array([
145
+ (x - b_min) / (b_max - b_min)
146
+ for x in b_scores
147
+ ], dtype=np.float64)
148
+
149
+ # 5. Fuse the scores together
150
+ full_dict: list[tuple[GenericType, float]] = []
151
+ for index, element in enumerate(a_list):
152
+ a_score = a_scores[index]
153
+ if (element in b_list):
154
+ # In both A and B. Fuse score.
155
+ b_score = b_scores[b_list.index(element)]
156
+ fused_score = a_weight * a_score + (1-a_weight) * b_score
157
+ full_dict.append((element, fused_score))
158
+ else:
159
+ # Only in A.
160
+ full_dict.append((element, a_weight * a_score))
161
+
162
+ for index, element in enumerate(b_list):
163
+ if (element not in a_list):
164
+ b_score = b_scores[index]
165
+ full_dict.append((element, (1-a_weight) * b_score))
166
+
167
+ full_dict = sorted(full_dict, key=lambda item: item[1], reverse=True)
168
+ output_list = [item[0] for item in full_dict]
169
+
170
+ if (top_k >= len(full_dict)):
171
+ return output_list
172
+
173
+ # create final response object
174
+ return output_list[:top_k]
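+
+
+ # Illustrative usage sketch: fusing two scored keyword lists.
+ # _merge_on_scores(
+ #     ["alpha", "beta"], ["beta", "gamma"],
+ #     [0.9, 0.4], [0.8, 0.7],
+ #     a_weight=0.5, top_k=2,
+ # )
+ # "beta" is scored in both lists, so its fused score tends to rank it first.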
metadata_adder.py ADDED
@@ -0,0 +1,280 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [Metadata Adders]
3
+ #####################################################
4
+ ### Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This creates an app to chat with PDFs.
8
+
9
+ # This is the Metadata Adders
10
+ # Which are classes that add metadata fields to documents.
11
+ # This often is used for summaries or keywords.
12
+ #####################################################
13
+ ### TODO Board:
14
+ # Seems like this overlaps well with the `metadata extractors` interface from llama_index.
15
+ # These are TransformComponents which take a Sequence of Nodes as input and return a list of Dicts as output (with the dicts storing metadata for each node).
16
+ # We should add a wrapper which adds this metadata to nodes.
17
+ # We should also add a wrapper
18
+
19
+ # https://github.com/run-llama/llama_index/blob/be3bd619ec114d26cf328d12117c033762695b3f/llama-index-core/llama_index/core/extractors/interface.py#L21
20
+ # https://github.com/run-llama/llama_index/blob/be3bd619ec114d26cf328d12117c033762695b3f/llama-index-core/llama_index/core/extractors/metadata_extractors.py#L332
21
+
22
+ #####################################################
23
+ ### PROGRAM SETTINGS
24
+
25
+
26
+ #####################################################
27
+ ### PROGRAM IMPORTS
28
+ from __future__ import annotations
29
+
30
+ import logging
31
+ import re
32
+ from abc import abstractmethod
33
+ from typing import Any, List, Optional, TypeVar, Sequence
34
+
35
+ from llama_index.core.bridge.pydantic import Field, PrivateAttr
36
+ from llama_index.core.schema import BaseNode, TransformComponent
37
+
38
+ # Own modules
39
+
40
+
41
+ #####################################################
42
+ ### CONSTANTS
43
+ # ah how beautiful the regex
44
+ # handy visualizer and checker: https://www.debuggex.com/, https://www.regexpr.com/
45
+ logger = logging.getLogger(__name__)
46
+ GenericNode = TypeVar("GenericNode", bound=BaseNode)
47
+
48
+ DATE_REGEX = re.compile(r"(?:(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?\s+(?:of\s+)?(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)|(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\s+(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?)(?:\,)?\s*(?:\d{4})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4}", re.IGNORECASE)
49
+ TIME_REGEX = re.compile(r"\d{1,2}:\d{2} ?(?:[ap]\.?m\.?)?|\d[ap]\.?m\.?", re.IGNORECASE)
50
+ EMAIL_REGEX = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")
51
+ PHONE_REGEX = re.compile(r"((?:(?<![\d-])(?:\+?\d{1,3}[-.\s*]?)?(?:\(?\d{3}\)?[-.\s*]?)?\d{3}[-.\s*]?\d{4}(?![\d-]))|(?:(?<![\d-])(?:(?:\(\+?\d{2}\))|(?:\+?\d{2}))\s*\d{2}\s*\d{3}\s*\d{4}(?![\d-])))")
52
+ MAIL_ADDR_REGEX = re.compile(r"\d{1,4}.{1,10}[\w\s]{1,20}[\s]+(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|parkway|pkwy|circle|cir|boulevard|blvd)\W?(?=\s|$)", re.IGNORECASE)
53
+
54
+ # DEFAULT_NUM_WORKERS = os.cpu_count() - 1 if os.cpu_count() else 1 # type: ignore
55
+
56
+
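+ # Hedged sanity-check sketch for the regexes above (the sample strings are made up, not from any document):
+ # >>> bool(DATE_REGEX.search("The report is due March 3rd, 2024."))   # True
+ # >>> bool(TIME_REGEX.search("Meet at 10:30 am."))                    # True
+ # >>> bool(EMAIL_REGEX.search("Contact jane.doe@example.com."))       # True
+ # >>> bool(PHONE_REGEX.search("Call (555) 123-4567."))                # True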
57
+ #####################################################
58
+ ### SCRIPT
59
+
60
+ class MetadataAdder(TransformComponent):
61
+ """Adds metadata to a node.
62
+
63
+ Args:
64
+ metadata_name: The name of the metadata to add to the node. Defaults to 'metadata'.
65
+ # num_workers: The number of workers to use for parallel processing. By default, use all available cores minus one. currently WIP.
66
+ """
67
+
68
+ metadata_name: str = Field(
69
+ default="metadata",
70
+ description="The name of the metadata field to add to the document. Defaults to 'metadata'.",
71
+ )
72
+ # num_workers: int = Field(
73
+ # default=DEFAULT_NUM_WORKERS,
74
+ # description="The number of workers to use for parallel processing. By default, use all available cores minus one.",
75
+ # )
76
+
77
+ def __init__(
78
+ self, metadata_name: str = "metadata", **kwargs: Any
79
+ ) -> None:
80
+ super().__init__(**kwargs)
81
+ self.metadata_name = metadata_name
82
+ # self.num_workers = num_workers
83
+
84
+ @classmethod
85
+ def class_name(cls) -> str:
86
+ return "MetadataAdder"
87
+
88
+ @abstractmethod
89
+ def get_node_metadata(self, node: BaseNode) -> str | None:
90
+ """Given a node, get the metadata for the node."""
91
+
92
+ def add_node_metadata(self, node: GenericNode, metadata_value: Any | None) -> GenericNode:
93
+ """Given a node and the metadata, add the metadata to the node's `metadata_name` field."""
94
+ if (metadata_value is None):
95
+ return node
96
+ else:
97
+ node.metadata[self.metadata_name] = metadata_value
98
+ return node
99
+
100
+ def process_nodes(self, nodes: list[GenericNode]) -> list[GenericNode]:
101
+ """Process the list of nodes. This gets called by __call__.
102
+
103
+ Args:
104
+ nodes (List[GenericNode]): The nodes to process.
105
+
106
+ Returns:
107
+ List[GenericNode]: The processed nodes, with metadata field metadata_name added.
108
+ """
109
+ output_nodes = []
110
+ for node in nodes:
111
+ node_metadata = self.get_node_metadata(node)
112
+ node_with_metadata = self.add_node_metadata(node, node_metadata)
113
+ output_nodes.append(node_with_metadata)
114
+ return(output_nodes)
115
+
116
+ def __call__(self, nodes: Sequence[BaseNode], **kwargs: Any) -> list[BaseNode]:
117
+ """Check whether nodes have the specified regex pattern."""
118
+ return self.process_nodes(nodes)
119
+
120
+
121
+ class RegexMetadataAdder(MetadataAdder):
122
+ """Adds regex metadata to a document.
123
+
124
+ Args:
125
+ regex_pattern: The regex pattern to search for.
126
+ metadata_name: The name of the metadata to add to the document. Defaults to 'regex_metadata'.
127
+ # num_workers: The number of workers to use for parallel processing. By default, use all available cores minus one.
128
+ """
129
+
130
+ _regex_pattern: re.Pattern = PrivateAttr()
131
+ _boolean_mode: bool = PrivateAttr()
132
+ # num_workers: int = Field(
133
+ # default=DEFAULT_NUM_WORKERS,
134
+ # description="The number of workers to use for parallel processing. By default, use all available cores minus one.",
135
+ # )
136
+
137
+ def __init__(
138
+ self,
139
+ regex_pattern: re.Pattern | str = DATE_REGEX,
140
+ metadata_name: str = "regex_metadata",
141
+ boolean_mode: bool = False,
142
+ # num_workers: int = DEFAULT_NUM_WORKERS,
143
+ **kwargs: Any,
144
+ ) -> None:
145
+ """Init params."""
146
+ if (isinstance(regex_pattern, str)):
147
+ regex_pattern = re.compile(regex_pattern)
148
+ # self.num_workers = num_workers
149
+ super().__init__(metadata_name=metadata_name, **kwargs) # ah yes i love oop :)
150
+ self._regex_pattern=regex_pattern
151
+ self._boolean_mode=boolean_mode
152
+
153
+ @classmethod
154
+ def class_name(cls) -> str:
155
+ return "RegexMetadataAdder"
156
+
157
+ def get_node_metadata(self, node: BaseNode) -> str | None:
158
+ """Given a node with text, return the regex match if it exists.
159
+
160
+ Args:
161
+ node (BaseNode): The base node to extract from.
162
+
163
+ Returns:
164
+ Optional[str]: The regex match if it exists. If not, return None.
165
+ """
166
+ if (getattr(node, "text", None) is None):
167
+ return None
168
+
169
+ if (self._boolean_mode):
170
+ return str(self._regex_pattern.search(node.text) is not None)  # search rather than match: the pattern may occur anywhere in the text
171
+ else:
172
+ return str(self._regex_pattern.findall(node.text)) # NOTE: we are saving these as a string'd list since this is easier
173
+
174
+
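+ # A minimal usage sketch for RegexMetadataAdder (in-memory node; not part of the pipeline wiring above):
+ #
+ # from llama_index.core.schema import TextNode
+ #
+ # date_adder = RegexMetadataAdder(regex_pattern=DATE_REGEX, metadata_name="dates")
+ # nodes = date_adder([TextNode(text="The inspection took place on March 3rd, 2024.")])
+ # print(nodes[0].metadata["dates"])  # string'd list of matches, e.g. "['March 3rd, 2024']"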
175
+ class ModelMetadataAdder(MetadataAdder):
176
+ """Adds metadata to nodes based on a language model."""
177
+
178
+ prompt_template: str = Field(
179
+ description="The prompt to use to generate the metadata. Defaults to DEFAULT_SUMMARY_TEMPLATE.",
180
+ )
181
+
182
+ def __init__(
183
+ self,
184
+ metadata_name: str,
185
+ prompt_template: str | None = None,
186
+ **kwargs: Any
187
+ ) -> None:
188
+ """Init params."""
189
+ super().__init__(metadata_name=metadata_name, prompt_template=prompt_template, **kwargs)
190
+
191
+ @classmethod
192
+ def class_name(cls) -> str:
193
+ return "ModelMetadataAdder"
194
+
195
+ @abstractmethod
196
+ def get_node_metadata(self, node: BaseNode) -> str | None:
197
+ """Given a node, get the metadata for the node.
198
+
199
+ Args:
200
+ node (BaseNode): The node to add metadata to.
201
+
202
+ Returns:
203
+ Optional[str]: The metadata if it exists. If not, return None.
204
+ """
205
+
206
+
207
+ class UnstructuredPDFPostProcessor(TransformComponent):
208
+ """Handles postprocessing of PDF which was read in using UnstructuredIO."""
209
+
210
+ ### NOTE: okay technically we could have done this in the IngestionPipeline abstraction. Maybe we integrate in the future?
211
+ # This component doesn't play nice with multi-processing due to having non-async LLMs.
212
+
213
+ # _embed_model: Optional[BaseEmbedding] = PrivateAttr()
214
+ _metadata_adders: list[MetadataAdder] = PrivateAttr()
215
+
216
+ def __init__(
217
+ self,
218
+ # embed_model: Optional[BaseEmbedding] = None,
219
+ metadata_adders: list[MetadataAdder] | None = None,
220
+ **kwargs: Any,
221
+ ) -> None:
222
+ super().__init__(**kwargs)
223
+ # self._embed_model = embed_model or Settings.embed_model
224
+ self._metadata_adders = metadata_adders or []
225
+
226
+ @classmethod
227
+ def class_name(cls) -> str:
228
+ return "UnstructuredPDFPostProcessor"
229
+
230
+ # def _apply_embed_model(self, nodes: List[BaseNode]) -> List[BaseNode]:
231
+ # if (self._embed_model is not None):
232
+ # nodes = self._embed_model(nodes)
233
+ # return nodes
234
+
235
+ def _apply_metadata_adders(self, nodes: list[GenericNode]) -> list[GenericNode]:
236
+ for metadata_adder in self._metadata_adders:
237
+ nodes = metadata_adder(nodes)
238
+ return nodes
239
+
240
+ def __call__(self, nodes: list[GenericNode], **kwargs: Any) -> Sequence[BaseNode]:
241
+ return self._apply_metadata_adders(nodes)
242
+ # nodes = self._apply_embed_model(nodes) # this goes second in case we want to embed the metadata.
243
+
244
+ # def has_email(input_text: str) -> bool:
245
+ # """
246
+ # Given a chunk of text, determine whether it has an email address or not.
247
+
248
+ # We're using the long complex email regex from https://emailregex.com/index.html
249
+ # """
250
+ # return (EMAIL_REGEX.search(input_text) is not None)
251
+
252
+
253
+ # def has_phone(input_text: str) -> bool:
254
+ # """
255
+ # Given a chunk of text, determine whether it has a phone number or not.
256
+ # """
257
+ # has_phone = PHONE_REGEX.search(input_text)
258
+ # return (has_phone is not None)
259
+
260
+
261
+ # def has_mail_addr(input_text: str) -> bool:
262
+ # """
263
+ # Given a chunk of text, determine whether it has a mailing address or not.
264
+
265
+ # NOTE: This is difficult to do with regex.
266
+ # ... We could use spacy's English language NER model instead / as well:
267
+ # Assume that addresses will have a GSP (geospatial political) or GPE (geopolitical entity).
268
+ # DOCS SEE: https://www.nltk.org/book/ch07.html | https://spacy.io/usage/linguistic-features
269
+ # """
270
+ # has_addr = MAIL_ADDR_REGEX.search(input_text)
271
+ # return (has_addr is not None)
272
+
273
+
274
+ # def has_date(input_text: str) -> bool:
275
+ # """
276
+ # Given a chunk of text, determine whether it has a date or not.
277
+ # NOTE: relative dates are stuff like "within 30 days"
278
+ # """
279
+ # has_date = DATE_REGEX.search(input_text)
280
+ # return (has_date is not None)
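+ # A rough sketch of how these adders are meant to be combined downstream (illustrative only;
+ # the real wiring happens in the PDF reader / pipeline modules):
+ #
+ # postprocessor = UnstructuredPDFPostProcessor(
+ #     metadata_adders=[
+ #         RegexMetadataAdder(regex_pattern=EMAIL_REGEX, metadata_name="emails"),
+ #         RegexMetadataAdder(regex_pattern=PHONE_REGEX, metadata_name="phones"),
+ #     ]
+ # )
+ # nodes = postprocessor(nodes)  # each node gains 'emails' and 'phones' metadata fields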
models.py ADDED
@@ -0,0 +1,785 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [MODELS]
3
+ #####################################################
4
+ # Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This project creates an app to chat with PDFs.
8
+
9
+ # This is the LANGUAGE MODELS
10
+ # that are used in the document reader.
11
+ #####################################################
12
+ ## TODOS:
13
+ # <!> Add support for vLLM / AWQ / GPTQ models. (probably not going to be done due to lack of attention scores)
14
+
15
+ # Add KTransformers backend?
16
+ # https://github.com/kvcache-ai/ktransformers
17
+
18
+ # https://github.com/Tada-AI/pdf_parser
19
+
20
+ #####################################################
21
+ ## IMPORTS:
22
+ from __future__ import annotations
23
+
24
+ import gc
25
+ import logging
26
+ import sys
27
+ from typing import (
28
+ Any,
29
+ Callable,
30
+ Dict,
31
+ List,
32
+ Optional,
33
+ Protocol,
34
+ Sequence,
35
+ Union,
36
+ cast,
37
+ runtime_checkable,
38
+ )
39
+
40
+ import streamlit as st
41
+ import torch
42
+ from llama_index.core.base.embeddings.base import BaseEmbedding
43
+ from llama_index.core.base.llms.base import BaseLLM
44
+ from llama_index.core.base.llms.generic_utils import (
45
+ messages_to_prompt as generic_messages_to_prompt,
46
+ )
47
+ from llama_index.core.base.llms.types import (
48
+ ChatMessage,
49
+ ChatResponse,
50
+ ChatResponseGen,
51
+ CompletionResponse,
52
+ CompletionResponseGen,
53
+ LLMMetadata,
54
+ MessageRole,
55
+ )
56
+ from llama_index.core.bridge.pydantic import Field, PrivateAttr
57
+ from llama_index.core.callbacks import CallbackManager
58
+ from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
59
+ from llama_index.core.llms.callbacks import (
60
+ llm_chat_callback,
61
+ llm_completion_callback,
62
+ )
63
+ from llama_index.core.multi_modal_llms import MultiModalLLM
64
+ from llama_index.core.postprocessor import SentenceTransformerRerank
65
+ from llama_index.core.prompts.base import PromptTemplate
66
+ from llama_index.core.schema import ImageDocument, ImageNode
67
+ from llama_index.core.types import BaseOutputParser, PydanticProgramMode
68
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
69
+ from llama_index.llms.huggingface import HuggingFaceLLM
70
+ from PIL import Image as PILImage
71
+ from transformers import (
72
+ AutoImageProcessor,
73
+ AutoModelForVision2Seq,
74
+ AutoTokenizer,
75
+ LogitsProcessor,
76
+ QuantoConfig,
77
+ StoppingCriteria,
78
+ StoppingCriteriaList,
79
+ )
80
+ from pydantic import WithJsonSchema  # NOTE: used by the Annotated alias below; assumes the pydantic v2 API
+ from typing_extensions import Annotated
81
+
82
+ # from wtpsplit import SaT # Sentence segmentation model. Dropping this. Requires adapters=0.2.1->Transformers=4.39.3 | Phi3 Vision requires Transformers 4.40.2
83
+
84
+ ## NOTE: Proposal for LAZY LOADING packages for running LLMS:
85
+ # Currently not done because emphasis is on local inference w/ ability to get Attention Scores, which is not yet supported in non-HF Transformers methods.
86
+
87
+ ## LLamacpp:
88
+ # from llama_index.llms.llama_cpp import LlamaCPP
89
+ # from llama_index.llms.llama_cpp.llama_utils import (
90
+ # messages_to_prompt,
91
+ # completion_to_prompt
92
+ # )
93
+
94
+ ## HF Transformers LLM:
95
+ # from transformers import AutoTokenizer, BitsAndBytesConfig
96
+ # from llama_index.llms.huggingface import HuggingFaceLLM
97
+
98
+ ## GROQ
99
+ # from llama_index.llms.groq import Groq
100
+
101
+ #####################################################
102
+ ### SETTINGS:
103
+ DEFAULT_HF_MULTIMODAL_LLM = "Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5"
104
+ DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW = 1024
105
+ DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS = 1024
106
+
107
+ #####################################################
108
+ ### CODE:
109
+ logger = logging.getLogger(__name__)
110
+
111
+ @st.cache_resource
112
+ def get_embedder(
113
+ model_path: str = "mixedbread-ai/mxbai-embed-large-v1",
114
+ device: str = "cuda", # 'cpu' is unbearably slow
115
+ ) -> BaseEmbedding:
116
+ """Given the path to an embedding model, load it."""
117
+ # NOTE: okay we definitely could have not made this wrapper, but shrug
118
+ return HuggingFaceEmbedding(
119
+ model_path,
120
+ device=device
121
+ )
122
+
123
+
124
+ @st.cache_resource
125
+ def get_reranker(
126
+ model_path: str = "mixedbread-ai/mxbai-rerank-large-v1",
127
+ top_n: int = 3,
128
+ device: str = "cpu", # 'cuda' if we were rich
129
+ ) -> SentenceTransformerRerank: # technically this is a BaseNodePostprocessor, but that seems too abstract.
130
+ """Given the path to a reranking model, load it."""
131
+ # NOTE: okay we definitely could have not made this wrapper, but shrug
132
+ return SentenceTransformerRerank(
133
+ model=model_path,
134
+ top_n=top_n,
135
+ device=device
136
+ )
137
+
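+ # Minimal usage sketch (defaults above; st.cache_resource means each model is only loaded once per session):
+ #
+ # embedder = get_embedder()
+ # reranker = get_reranker(top_n=5)
+ # query_embedding = embedder.get_query_embedding("What is the total amount due?")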
138
+
139
+ ## LLM Options Below
140
+ # def _get_llamacpp_llm(
141
+ # model_path: str,
142
+ # model_seed: int = 31415926,
143
+ # model_temperature: float = 1e-64, # ideally 0, but HF-type doesn't allow that. # a good dev might use sys.float_info()['min']
144
+ # model_context_length: Optional[int] = 8192,
145
+ # model_max_new_tokens: Optional[int] = 1024,
146
+ # ) -> BaseLLM:
147
+ # """Load a LlamaCPP model using GPU and other sane defaults."""
148
+ # # Lazy Loading
149
+ # from llama_index.llms.llama_cpp import LlamaCPP
150
+ # from llama_index.llms.llama_cpp.llama_utils import (
151
+ # messages_to_prompt,
152
+ # completion_to_prompt
153
+ # )
154
+
155
+ # # Arguments to Pass
156
+ # llm = LlamaCPP(
157
+ # model_path=model_path,
158
+ # temperature=model_temperature,
159
+ # max_new_tokens=model_max_new_tokens,
160
+ # context_window=model_context_length,
161
+ # # kwargs to pass to __call__()
162
+ # generate_kwargs={'seed': model_seed}, # {'temperature': TEMPERATURE, 'top_p':0.7, 'min_p':0.1, 'seed': MODEL_SEED},
163
+ # # kwargs to pass to __init__()
164
+ # # set to at least 1 to use GPU
165
+ # model_kwargs={'n_gpu_layers': -1, 'n_threads': os.cpu_count()-1}, #, 'rope_freq_scale': 0.83, 'rope_freq_base': 20000},
166
+ # # transform inputs into model format
167
+ # messages_to_prompt=messages_to_prompt,
168
+ # completion_to_prompt=completion_to_prompt,
169
+ # verbose=True,
170
+ # )
171
+ # return (llm)
172
+
173
+
174
+ @st.cache_resource
175
+ def _get_hf_llm(
176
+ model_path: str,
177
+ model_temperature: float = sys.float_info.min, # ideally 0, but the HF wrapper rejects exactly 0, so use the smallest positive float
178
+ model_context_length: int | None = 16384,
179
+ model_max_new_tokens: int | None = 2048,
180
+ hf_quant_level: int | None = 8,
181
+ ) -> BaseLLM:
182
+ """Load a Huggingface-Transformers based model using sane defaults."""
183
+ # Fix temperature if needed; HF implementation complains about it being zero
184
+ model_temperature = max(sys.float_info.min, model_temperature)
185
+
186
+ # Get quantization config with Quanto
187
+ quanto_config = None # NOTE: by default, no quantization.
188
+ if (hf_quant_level == 4):
189
+ # bnb_config = BitsAndBytesConfig(
190
+ # # load_in_8bit=True,
191
+ # load_in_4bit=True,
192
+ # # bnb_4bit_use_double_quant=True,
193
+ # bnb_4bit_quant_type="nf4",
194
+ # bnb_4bit_compute_dtype='bfloat16', # NOTE: Tesla T4 GPUs are too crappy for bfloat16
195
+ # # bnb_4bit_compute_dtype='float16'
196
+ # )
197
+ quanto_config = QuantoConfig(
198
+ weights="int4" # there's also 'int2' if you're crazy...
199
+ )
200
+ elif (hf_quant_level == 8):
201
+ # bnb_config = BitsAndBytesConfig(
202
+ # load_in_8bit=True
203
+ # )
204
+ quanto_config = QuantoConfig(
205
+ weights="int8"
206
+ )
207
+
208
+ # Get Stopping Tokens for Llama3 based models, because they're /special/ and added a new one.
209
+ tokenizer = AutoTokenizer.from_pretrained(
210
+ model_path
211
+ )
212
+ stopping_ids = [
213
+ tokenizer.eos_token_id,
214
+ tokenizer.convert_tokens_to_ids("<|eot_id|>"),
215
+ ]
216
+ return HuggingFaceLLM(
217
+ model_name=model_path,
218
+ tokenizer_name=model_path,
219
+ stopping_ids=stopping_ids,
220
+ max_new_tokens=model_max_new_tokens or DEFAULT_NUM_OUTPUTS,
221
+ context_window=model_context_length or DEFAULT_CONTEXT_WINDOW,
222
+ tokenizer_kwargs={"trust_remote_code": True},
223
+ model_kwargs={"trust_remote_code": True, "quantization_config": quanto_config},
224
+ generate_kwargs={
225
+ "do_sample": not model_temperature > sys.float_info.min,
226
+ "temperature": model_temperature,
227
+ },
228
+ is_chat_model=True,
229
+ )
230
+
231
+
232
+ @st.cache_resource
233
+ def get_llm(
234
+ model_path: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
235
+ model_temperature: float = 0, # ideally 0; clamped up to the smallest positive float inside _get_hf_llm
236
+ model_context_length: int | None = 8192,
237
+ model_max_new_tokens: int | None = 1024,
238
+
239
+ hf_quant_level: int | None = 8, # 4-bit / 8-bit loading for HF models
240
+ ) -> BaseLLM:
241
+ """
242
+ Given the path to an LLM, determine the type, load it in, and convert it into a LlamaIndex-compatible LLM.
243
+
244
+ NOTE: I chose to set some "sane" defaults, so it's probably not as flexible as some other dev would like.
245
+ """
246
+ # if (model_path_extension == ".gguf"):
247
+ # ##### LLAMA.CPP
248
+ # return(_get_llamacpp_llm(model_path, model_seed, model_temperature, model_context_length, model_max_new_tokens))
249
+
250
+ # TODO(Jonathan Wang): Consider non-HF-Transformers backends
251
+ # vLLM support for AWQ/GPTQ models
252
+ # I guess reluctantly AutoAWQ and AutoGPTQ packages.
253
+ # Exllamav2 is kinda dead IMO.
254
+
255
+ # else:
256
+ #### No extension or weird fake extension suggests a folder, i.e., the base model from HF
257
+ return(_get_hf_llm(model_path=model_path, model_temperature=model_temperature, model_context_length=model_context_length, model_max_new_tokens=model_max_new_tokens, hf_quant_level=hf_quant_level))
258
+
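+ # Minimal usage sketch (assumes the default Llama 3.1 8B Instruct checkpoint is accessible on this machine):
+ #
+ # llm = get_llm(hf_quant_level=8)
+ # response = llm.complete("Summarize the attached lease agreement in one sentence.")
+ # print(response.text)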
259
+
260
+ # @st.cache_resource
261
+ # def get_llm() -> BaseLLM:
262
+ # from llama_index.llms.groq import Groq
263
+
264
+ # llm = Groq(
265
+ # model='llama-3.1-8b-instant', # old: 'llama3-8b-8192'
266
+ # api_key=os.environ.get('GROQ_API_KEY'),
267
+ # )
268
+ # return (llm)
269
+
270
+
271
+ class EosLogitProcessor(LogitsProcessor):
272
+ """Special snowflake processor for Salesforce Vision Model."""
273
+ def __init__(self, eos_token_id: int, end_token_id: int):
274
+ super().__init__()
275
+ self.eos_token_id = eos_token_id
276
+ self.end_token_id = end_token_id
277
+
278
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
279
+ if input_ids.size(1) > 1: # Expect at least 1 output token.
280
+ forced_eos = torch.full((scores.size(1),), -float("inf"), device=input_ids.device)
281
+ forced_eos[self.eos_token_id] = 0
282
+
283
+ # Force generation of EOS after the <|end|> token.
284
+ scores[input_ids[:, -1] == self.end_token_id] = forced_eos
285
+ return scores
286
+
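+ # Sketch of how this processor is wired into generation further below (32007 is assumed to be
+ # the <|end|> token id for the Salesforce/Phi-3 checkpoint):
+ #
+ # tokens = model.generate(
+ #     **inputs,
+ #     logits_processor=[EosLogitProcessor(eos_token_id=tokenizer.eos_token_id, end_token_id=32007)],
+ # )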
287
+ # NOTE: These two protocols are needed to appease mypy
288
+ # https://github.com/run-llama/llama_index/blob/5238b04c183119b3035b84e2663db115e63dcfda/llama-index-core/llama_index/core/llms/llm.py#L89
289
+ @runtime_checkable
290
+ class MessagesImagesToPromptType(Protocol):
291
+ def __call__(self, messages: Sequence[ChatMessage], images: Sequence[ImageDocument], **kwargs: Any) -> str:
292
+ pass
293
+
294
+ MessagesImagesToPromptCallable = Annotated[
295
+ Optional[MessagesImagesToPromptType],
296
+ WithJsonSchema({"type": "string"}),
297
+ ]
298
+
299
+
300
+ # https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5/blob/main/batch_inference.ipynb
301
+
302
+ class HuggingFaceMultiModalLLM(MultiModalLLM):
303
+ """Supposed to be a wrapper around HuggingFace's Vision LLMS.
304
+ Currently only supports one model type: Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5
305
+ """
306
+
307
+ model_name: str = Field(
308
+ description='The multi-modal huggingface LLM to use. Currently only using Phi3.',
309
+ default=DEFAULT_HF_MULTIMODAL_LLM
310
+ )
311
+ context_window: int = Field(
312
+ default=DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW,
313
+ description="The maximum number of tokens available for input.",
314
+ gt=0,
315
+ )
316
+ max_new_tokens: int = Field(
317
+ default=DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS,
318
+ description="The maximum number of tokens to generate.",
319
+ gt=0,
320
+ )
321
+ system_prompt: str = Field(
322
+ default="",
323
+ description=(
324
+ "The system prompt, containing any extra instructions or context. "
325
+ "The model card on HuggingFace should specify if this is needed."
326
+ ),
327
+ )
328
+ query_wrapper_prompt: PromptTemplate = Field(
329
+ default=PromptTemplate("{query_str}"),
330
+ description=(
331
+ "The query wrapper prompt, containing the query placeholder. "
332
+ "The model card on HuggingFace should specify if this is needed. "
333
+ "Should contain a `{query_str}` placeholder."
334
+ ),
335
+ )
336
+ tokenizer_name: str = Field(
337
+ default=DEFAULT_HF_MULTIMODAL_LLM,
338
+ description=(
339
+ "The name of the tokenizer to use from HuggingFace. "
340
+ "Unused if `tokenizer` is passed in directly."
341
+ ),
342
+ )
343
+ processor_name: str = Field(
344
+ default=DEFAULT_HF_MULTIMODAL_LLM,
345
+ description=(
346
+ "The name of the processor to use from HuggingFace. "
347
+ "Unused if `processor` is passed in directly."
348
+ ),
349
+ )
350
+ device_map: str = Field(
351
+ default="auto", description="The device_map to use. Defaults to 'auto'."
352
+ )
353
+ stopping_ids: list[int] = Field(
354
+ default_factory=list,
355
+ description=(
356
+ "The stopping ids to use. "
357
+ "Generation stops when these token IDs are predicted."
358
+ ),
359
+ )
360
+ tokenizer_outputs_to_remove: list = Field(
361
+ default_factory=list,
362
+ description=(
363
+ "The outputs to remove from the tokenizer. "
364
+ "Sometimes huggingface tokenizers return extra inputs that cause errors."
365
+ ),
366
+ )
367
+ tokenizer_kwargs: dict = Field(
368
+ default_factory=dict, description="The kwargs to pass to the tokenizer."
369
+ )
370
+ processor_kwargs: dict = Field(
371
+ default_factory=dict, description="The kwargs to pass to the processor."
372
+ )
373
+ model_kwargs: dict = Field(
374
+ default_factory=dict,
375
+ description="The kwargs to pass to the model during initialization.",
376
+ )
377
+ generate_kwargs: dict = Field(
378
+ default_factory=dict,
379
+ description="The kwargs to pass to the model during generation.",
380
+ )
381
+ is_chat_model: bool = Field(
382
+ default=False,
383
+ description=(
384
+ "Whether the model can have multiple messages passed at once, like the OpenAI chat API."
385
+ # LLMMetadata.__fields__["is_chat_model"].field_info.description
386
+ # + " Be sure to verify that you either pass an appropriate tokenizer "
387
+ # "that can convert prompts to properly formatted chat messages or a "
388
+ # "`messages_to_prompt` that does so."
389
+ ),
390
+ )
391
+ messages_images_to_prompt: MessagesImagesToPromptCallable = Field(
392
+ default=generic_messages_to_prompt,
393
+ description="A function that takes in a list of messages and images and returns a prompt string.",
394
+ )
395
+
396
+ _model: Any = PrivateAttr()
397
+ _tokenizer: Any = PrivateAttr()
398
+ # TODO(Jonathan Wang): We need to add a separate field for AutoProcessor as opposed to ImageProcessors.
399
+ _processor: Any = PrivateAttr()
400
+ _stopping_criteria: Any = PrivateAttr()
401
+
402
+ def __init__(
403
+ self,
404
+ context_window: int = DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW,
405
+ max_new_tokens: int = DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS,
406
+ query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}",
407
+ tokenizer_name: str = DEFAULT_HF_MULTIMODAL_LLM,
408
+ processor_name: str = DEFAULT_HF_MULTIMODAL_LLM,
409
+ model_name: str = DEFAULT_HF_MULTIMODAL_LLM,
410
+ model: Any | None = None,
411
+ tokenizer: Any | None = None,
412
+ processor: Any | None = None,
413
+ device_map: str = "auto",
414
+ stopping_ids: list[int] | None = None,
415
+ tokenizer_kwargs: dict[str, Any] | None = None,
416
+ processor_kwargs: dict[str, Any] | None = None,
417
+ tokenizer_outputs_to_remove: list[str] | None = None,
418
+ model_kwargs: dict[str, Any] | None = None,
419
+ generate_kwargs: dict[str, Any] | None = None,
420
+ is_chat_model: bool = False,
421
+ callback_manager: CallbackManager | None = None,
422
+ system_prompt: str = "",
423
+ messages_images_to_prompt: Callable[[Sequence[ChatMessage], Sequence[ImageDocument]], str] | None = None,
424
+ # completion_to_prompt: Callable[[str], str] | None = None,
425
+ # pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
426
+ # output_parser: BaseOutputParser | None = None,
427
+ ) -> None:
428
+
429
+ logger.info(f"CUDA Memory Pre-AutoModelForVision2Seq: {torch.cuda.mem_get_info()}")
430
+ # Salesforce one is a AutoModelForVision2Seq, but not AutoCausalLM which is more common.
431
+ model = model or AutoModelForVision2Seq.from_pretrained(
432
+ model_name,
433
+ device_map=device_map,
434
+ trust_remote_code=True,
435
+ **(model_kwargs or {})
436
+ )
437
+ logger.info(f"CUDA Memory Post-AutoModelForVision2Seq: {torch.cuda.mem_get_info()}")
438
+
439
+ # check context_window
440
+ config_dict = model.config.to_dict()
441
+ model_context_window = int(
442
+ config_dict.get("max_position_embeddings", context_window)
443
+ )
444
+ if model_context_window < context_window:
445
+ logger.warning(
446
+ f"Supplied context_window {context_window} is greater "
447
+ f"than the model's max input size {model_context_window}. "
448
+ "Disable this warning by setting a lower context_window."
449
+ )
450
+ context_window = model_context_window
451
+
452
+ processor_kwargs = processor_kwargs or {}
453
+ if "max_length" not in processor_kwargs:
454
+ processor_kwargs["max_length"] = context_window
455
+
456
+ # NOTE: Sometimes models (phi-3) will use AutoProcessor and include the tokenizer within it.
457
+ logger.info(f"CUDA Memory Pre-Processor: {torch.cuda.mem_get_info()}")
458
+ processor = processor or AutoImageProcessor.from_pretrained(
459
+ processor_name or model_name,
460
+ trust_remote_code=True,
461
+ **processor_kwargs
462
+ )
463
+ logger.info(f"CUDA Memory Post-Processor: {torch.cuda.mem_get_info()}")
464
+
465
+ tokenizer = tokenizer or AutoTokenizer.from_pretrained(
466
+ tokenizer_name or model_name,
467
+ trust_remote_code=True,
468
+ **(tokenizer_kwargs or {})
469
+ )
470
+ logger.info(f"CUDA Memory Post-Tokenizer: {torch.cuda.mem_get_info()}")
471
+
472
+ # Tokenizer-Model disagreement
473
+ if (hasattr(tokenizer, "name_or_path") and tokenizer.name_or_path != model_name): # type: ignore (checked for attribute)
474
+ logger.warning(
475
+ f"The model `{model_name}` and processor `{getattr(tokenizer, 'name_or_path', None)}` "
476
+ f"are different, please ensure that they are compatible."
477
+ )
478
+ # Processor-Model disagreement
479
+ if (hasattr(processor, "name_or_path") and getattr(processor, "name_or_path", None) != model_name):
480
+ logger.warning(
481
+ f"The model `{model_name}` and processor `{getattr(processor, 'name_or_path', None)}` "
482
+ f"are different, please ensure that they are compatible."
483
+ )
484
+
485
+ # setup stopping criteria
486
+ stopping_ids_list = stopping_ids or []
487
+
488
+ class StopOnTokens(StoppingCriteria):
489
+ def __call__(
490
+ self,
491
+ input_ids: torch.LongTensor,
492
+ scores: torch.FloatTensor,
493
+ **kwargs: Any,
494
+ ) -> bool:
495
+ return any(input_ids[0][-1] == stop_id for stop_id in stopping_ids_list)
496
+
497
+ stopping_criteria = StoppingCriteriaList([StopOnTokens()])
498
+
499
+ if isinstance(query_wrapper_prompt, str):
500
+ query_wrapper_prompt = PromptTemplate(query_wrapper_prompt)
501
+
502
+ messages_images_to_prompt = messages_images_to_prompt or self._processor_messages_to_prompt
503
+
504
+ # Initiate standard LLM
505
+ super().__init__(
506
+ callback_manager=callback_manager or CallbackManager([]),
507
+ )
508
+ logger.info(f"CUDA Memory Post-SuperInit: {torch.cuda.mem_get_info()}")
509
+
510
+ # Initiate remaining fields
511
+ self._model = model
512
+ self._tokenizer = tokenizer
513
+ self._processor = processor
514
+ logger.info(f"CUDA Memory Post-Init: {torch.cuda.mem_get_info()}")
515
+ self._stopping_criteria = stopping_criteria
516
+ self.model_name = model_name
517
+ self.context_window=context_window
518
+ self.max_new_tokens=max_new_tokens
519
+ self.system_prompt=system_prompt
520
+ self.query_wrapper_prompt=query_wrapper_prompt
521
+ self.tokenizer_name=tokenizer_name
522
+ self.processor_name=processor_name
523
+ self.model_name=model_name
524
+ self.device_map=device_map
525
+ self.stopping_ids=stopping_ids or []
526
+ self.tokenizer_outputs_to_remove=tokenizer_outputs_to_remove or []
527
+ self.tokenizer_kwargs=tokenizer_kwargs or {}
528
+ self.processor_kwargs=processor_kwargs or {}
529
+ self.model_kwargs=model_kwargs or {}
530
+ self.generate_kwargs=generate_kwargs or {}
531
+ self.is_chat_model=is_chat_model
532
+ self.messages_images_to_prompt=messages_images_to_prompt
533
+ # self.completion_to_prompt=completion_to_prompt,
534
+ # self.pydantic_program_mode=pydantic_program_mode,
535
+ # self.output_parser=output_parser,
536
+
537
+ @classmethod
538
+ def class_name(cls) -> str:
539
+ return "HuggingFace_MultiModal_LLM"
540
+
541
+ @property
542
+ def metadata(self) -> LLMMetadata:
543
+ """LLM metadata."""
544
+ return LLMMetadata(
545
+ context_window=self.context_window,
546
+ num_output=self.max_new_tokens,
547
+ model_name=self.model_name,
548
+ is_chat_model=self.is_chat_model,
549
+ )
550
+
551
+ def _processor_messages_to_prompt(self, messages: Sequence[ChatMessage], images: Sequence[ImageDocument]) -> str:
552
+ ### TODO(Jonathan Wang): Make this work generically. Currently we're building for `Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5`
553
+ """Converts a list of messages into a prompt for the multimodal LLM.
554
+ NOTE: we assume for simplicity here that these images are related, and not the user bouncing between multiple different topics. Thus, we send them all at once.
555
+
556
+ Args:
557
+ messages (Sequence[ChatMessage]): A list of the messages to convert, where each message is a dict containing the message role and content.
558
+ images (Sequence[ImageDocument]): The number of images the user is passing to the MultiModalLLM.
559
+ Returns:
560
+ str: The prompt.
561
+ """
562
+ # NOTE: For `Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5`, we actually ignore the `images`; no placeholders are inserted.
563
+
564
+ """Use the tokenizer to convert messages to prompt. Fallback to generic."""
565
+ if hasattr(self._tokenizer, "apply_chat_template"):
566
+ messages_dict = [
567
+ {"role": message.role.value, "content": message.content}
568
+ for message in messages
569
+ ]
570
+ return self._tokenizer.apply_chat_template(
571
+ messages_dict, tokenize=False, add_generation_prompt=True
572
+ )
573
+
574
+ return generic_messages_to_prompt(messages)
575
+
576
+ @llm_completion_callback()
577
+ def complete(
578
+ self,
579
+ prompt: str,
580
+ image_documents: ImageNode | List[ImageNode] | ImageDocument | List[ImageDocument], # this also takes ImageDocument which inherits from ImageNode.
581
+ formatted: bool = False,
582
+ **kwargs: Any
583
+ ) -> CompletionResponse:
584
+ """Given a prompt and image node(s), get the Phi-3 Vision prompt"""
585
+ # Handle images input
586
+ # https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5/blob/main/demo.ipynb
587
+ batch_image_list = []
588
+ batch_image_sizes = []
589
+ batch_prompt = []
590
+
591
+ # Fix image_documents input typing
592
+ if (not isinstance(image_documents, list)):
593
+ image_documents = [image_documents]
594
+ image_documents = [cast(ImageDocument, image) for image in image_documents] # we probably won't be using the Document features, so I think this is fine.
595
+
596
+ # Convert input images into PIL images for the model.
597
+ image_list = []
598
+ image_sizes = []
599
+ for image in image_documents:
600
+ # NOTE: ImageDocument inherits from ImageNode. We'll go extract the image.
601
+ image_io = image.resolve_image()
602
+ image_pil = PILImage.open(image_io)
603
+ image_list.append(self._processor([image_pil], image_aspect_ratio='anyres')['pixel_values'].to(self._model.device))
604
+ image_sizes.append(image_pil.size)
605
+
606
+ batch_image_list.append(image_list)
607
+ batch_image_sizes.append(image_sizes)
608
+ batch_prompt.append(prompt) # only one question per image
609
+
610
+ # Get the prompt
611
+ if not formatted and self.query_wrapper_prompt:
612
+ prompt = self.query_wrapper_prompt.format(query_str=prompt)
613
+
614
+ prompt_sequence = []
615
+ if self.system_prompt:
616
+ prompt_sequence.append(ChatMessage(role=MessageRole.SYSTEM, content=self.system_prompt))
617
+ prompt_sequence.append(ChatMessage(role=MessageRole.USER, content=prompt))
618
+
619
+ prompt = self.messages_images_to_prompt(messages=prompt_sequence, images=image_documents)
620
+
621
+ # Get the model input
622
+ batch_inputs = {
623
+ "pixel_values": batch_image_list
624
+ }
625
+ language_inputs = self._tokenizer(
626
+ [prompt],
627
+ return_tensors="pt",
628
+ padding='longest', # probably not needed.
629
+ max_length=self._tokenizer.model_max_length,
630
+ truncation=True
631
+ ).to(self._model.device)
632
+ # TODO: why does the example cookbook have this weird conversion to Cuda instead of .to(device)?
633
+ # language_inputs = {name: tensor.cuda() for name, tensor in language_inputs.items()}
634
+ batch_inputs.update(language_inputs)
635
+
636
+ gc.collect()
637
+ torch.cuda.empty_cache()
638
+
639
+ # remove keys from the tokenizer if needed, to avoid HF errors
640
+ # TODO: this probably is broken and wouldn't work.
641
+ for key in self.tokenizer_outputs_to_remove:
642
+ if key in batch_inputs:
643
+ batch_inputs.pop(key, None)
644
+
645
+ # Get output
646
+ tokens = self._model.generate(
647
+ **batch_inputs,
648
+ image_sizes=batch_image_sizes,
649
+ pad_token_id=self._tokenizer.pad_token_id,
650
+ eos_token_id=self._tokenizer.eos_token_id,
651
+ max_new_tokens=self.max_new_tokens,
652
+ stopping_criteria=self._stopping_criteria,
653
+ # NOTE: Special snowflake processor for Salesforce XGEN Phi3 Mini.
654
+ logits_processor=[EosLogitProcessor(eos_token_id=self._tokenizer.eos_token_id, end_token_id=32007)],
655
+ **self.generate_kwargs
656
+ )
657
+ gc.collect()
658
+ torch.cuda.empty_cache()
659
+
660
+ # completion_tokens = tokens[:, batch_inputs['input_ids'].shape[1]:]
661
+ completion = self._tokenizer.batch_decode(
662
+ tokens,
663
+ skip_special_tokens=True,
664
+ clean_up_tokenization_spaces=False
665
+ )[0]
666
+ gc.collect()
667
+ torch.cuda.empty_cache()
668
+
669
+ output = CompletionResponse(text=completion, raw={'model_output': tokens})
670
+
671
+ # Clean stuff up
672
+ del batch_image_list, batch_image_sizes, batch_inputs, tokens, completion
673
+ gc.collect()
674
+ torch.cuda.empty_cache()
675
+
676
+ # Return the completion
677
+ return output
678
+
679
+ @llm_completion_callback()
680
+ def stream_complete(
681
+ self, prompt: str, formatted: bool = False, **kwargs: Any
682
+ ) -> CompletionResponseGen:
683
+ raise NotImplementedError
684
+
685
+ @llm_chat_callback()
686
+ def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
687
+ raise NotImplementedError
688
+
689
+ @llm_chat_callback()
690
+ def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
691
+ raise NotImplementedError
692
+
693
+ @llm_completion_callback()
694
+ async def acomplete(
695
+ self,
696
+ prompt: str,
697
+ images: ImageNode | List[ImageNode], # this also takes ImageDocument which inherits from ImageNode.
698
+ formatted: bool = False,
699
+ **kwargs: Any
700
+ ) -> CompletionResponse:
701
+ raise NotImplementedError
702
+
703
+ @llm_completion_callback()
704
+ async def astream_complete(
705
+ self, prompt: str, formatted: bool = False, **kwargs: Any
706
+ ) -> CompletionResponseGen:
707
+ raise NotImplementedError
708
+
709
+ @llm_chat_callback()
710
+ async def achat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
711
+ raise NotImplementedError
712
+
713
+ @llm_chat_callback()
714
+ async def astream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
715
+ raise NotImplementedError
716
+
717
+
718
+ # @st.cache_resource()
719
+ # def get_multimodal_llm(**kwargs) -> MultiModalLLM:
720
+ # vision_llm = OpenAIMultiModal(
721
+ # model='gpt-4o-mini',
722
+ # temperature=0,
723
+ # max_new_tokens=512,
724
+ # image_detail='auto'
725
+ # )
726
+ # return (vision_llm)
727
+
728
+ @st.cache_resource
729
+ def get_multimodal_llm(
730
+ model_name: str = DEFAULT_HF_MULTIMODAL_LLM,
731
+ device_map: str = "cuda", # does not support 'auto'
732
+ processor_kwargs: dict[str, Any] | None = None,
733
+ model_kwargs: dict[str, Any] | None = None, # {'torch_dtype': torch.bfloat16}, # {'torch_dtype': torch.float8_e5m2}
734
+ generate_kwargs: dict[str, Any] | None = None, # from the example cookbook
735
+
736
+ hf_quant_level: int | None = 8,
737
+ ) -> HuggingFaceMultiModalLLM:
738
+
739
+ # Get default generate kwargs
740
+ if model_kwargs is None:
741
+ model_kwargs = {}
742
+ if processor_kwargs is None:
743
+ processor_kwargs = {}
744
+ if generate_kwargs is None:
745
+ generate_kwargs = {
746
+ "temperature": sys.float_info.min,
747
+ "top_p": None,
748
+ "num_beams": 1
749
+ # NOTE: we hack in EOSLogitProcessor in the HuggingFaceMultiModalLLM because it allows us to get the tokenizer.eos_token_id
750
+ }
751
+
752
+ # Get Quantization with Quanto
753
+ quanto_config = None # NOTE: by default, no quantization.
754
+ if (hf_quant_level == 4):
755
+ # bnb_config = BitsAndBytesConfig(
756
+ # # load_in_8bit=True,
757
+ # load_in_4bit=True,
758
+ # # bnb_4bit_use_double_quant=True,
759
+ # bnb_4bit_quant_type="nf4",
760
+ # bnb_4bit_compute_dtype='bfloat16', # NOTE: Tesla T4 GPUs are too crappy for bfloat16
761
+ # # bnb_4bit_compute_dtype='float16'
762
+ # )
763
+ quanto_config = QuantoConfig(
764
+ weights="int4" # there's also 'int2' if you're crazy...
765
+ )
766
+ elif (hf_quant_level == 8):
767
+ # bnb_config = BitsAndBytesConfig(
768
+ # load_in_8bit=True
769
+ # )
770
+ quanto_config = QuantoConfig(
771
+ weights="int8"
772
+ )
773
+
774
+ if (quanto_config is not None):
775
+ model_kwargs["quantization_config"] = quanto_config
776
+
777
+ return HuggingFaceMultiModalLLM(
778
+ model_name=model_name,
779
+ device_map=device_map,
780
+ processor_kwargs=processor_kwargs,
781
+ model_kwargs=model_kwargs,
782
+ generate_kwargs=generate_kwargs,
783
+
784
+ max_new_tokens=1024 # from the example cookbook
785
+ )
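+ # A rough usage sketch for the multimodal path (the image path is hypothetical; assumes the default
+ # Salesforce checkpoint can be downloaded):
+ #
+ # from llama_index.core.schema import ImageDocument
+ #
+ # vision_llm = get_multimodal_llm()
+ # image = ImageDocument(image_path="figures/page_3_table.png")
+ # caption = vision_llm.complete(prompt="Describe this figure.", image_documents=[image])
+ # print(caption.text)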
obs_logging.py ADDED
@@ -0,0 +1,380 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [OBSERVATION/LOGGING]
3
+ #####################################################
4
+ # Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This project creates an app to chat with PDFs.
8
+
9
+ # This is the Observation and Logging
10
+ # to see the actions undertaken in the RAG pipeline.
11
+ #####################################################
12
+ ## TODOS:
13
+ # Why does FullRAGEventHandler keep producing duplicate output?
14
+
15
+ #####################################################
16
+ ## IMPORTS:
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ from typing import TYPE_CHECKING, Any, ClassVar, Sequence
21
+
22
+ import streamlit as st
23
+
24
+ # Callbacks
25
+ from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
26
+
27
+ # Pretty Printing
28
+ # from llama_index.core.response.notebook_utils import display_source_node
29
+ # End user handler
30
+ from llama_index.core.instrumentation import get_dispatcher
31
+ from llama_index.core.instrumentation.event_handlers import BaseEventHandler
32
+ from llama_index.core.instrumentation.events.agent import (
33
+ AgentChatWithStepEndEvent,
34
+ AgentChatWithStepStartEvent,
35
+ AgentRunStepEndEvent,
36
+ AgentRunStepStartEvent,
37
+ AgentToolCallEvent,
38
+ )
39
+ from llama_index.core.instrumentation.events.chat_engine import (
40
+ StreamChatDeltaReceivedEvent,
41
+ StreamChatErrorEvent,
42
+ )
43
+ from llama_index.core.instrumentation.events.embedding import (
44
+ EmbeddingEndEvent,
45
+ EmbeddingStartEvent,
46
+ )
47
+ from llama_index.core.instrumentation.events.llm import (
48
+ LLMChatEndEvent,
49
+ LLMChatInProgressEvent,
50
+ LLMChatStartEvent,
51
+ LLMCompletionEndEvent,
52
+ LLMCompletionStartEvent,
53
+ LLMPredictEndEvent,
54
+ LLMPredictStartEvent,
55
+ LLMStructuredPredictEndEvent,
56
+ LLMStructuredPredictStartEvent,
57
+ )
58
+ from llama_index.core.instrumentation.events.query import (
59
+ QueryEndEvent,
60
+ QueryStartEvent,
61
+ )
62
+ from llama_index.core.instrumentation.events.rerank import (
63
+ ReRankEndEvent,
64
+ ReRankStartEvent,
65
+ )
66
+ from llama_index.core.instrumentation.events.retrieval import (
67
+ RetrievalEndEvent,
68
+ RetrievalStartEvent,
69
+ )
70
+ from llama_index.core.instrumentation.events.span import (
71
+ SpanDropEvent,
72
+ )
73
+ from llama_index.core.instrumentation.events.synthesis import (
74
+ # GetResponseEndEvent,
75
+ GetResponseStartEvent,
76
+ SynthesizeEndEvent,
77
+ SynthesizeStartEvent,
78
+ )
79
+ from llama_index.core.instrumentation.span import SimpleSpan
80
+ from llama_index.core.instrumentation.span_handlers.base import BaseSpanHandler
81
+ from treelib import Tree
82
+
83
+ if TYPE_CHECKING:
84
+ from llama_index.core.instrumentation.dispatcher import Dispatcher
85
+ from llama_index.core.instrumentation.events import BaseEvent
86
+ from llama_index.core.schema import BaseNode, NodeWithScore
87
+
88
+ #####################################################
89
+ ## Code
90
+ logger = logging.getLogger(__name__)
91
+
92
+ @st.cache_resource
93
+ def get_callback_manager() -> CallbackManager:
94
+ """Create the callback manager for the code."""
95
+ return CallbackManager([LlamaDebugHandler()])
96
+
97
+
98
+ def display_source_node(source_node: NodeWithScore, max_length: int = 100) -> str:
99
+ source_text = source_node.node.get_content().strip()
100
+ source_text = source_text[:max_length] + "..." if len(source_text) > max_length else source_text
101
+ return (
102
+ f"**Node ID:** {source_node.node.node_id}<br>"
103
+ f"**Similarity:** {source_node.score}<br>"
104
+ f"**Text:** {source_text}<br>"
105
+ )
106
+
107
+ class RAGEventHandler(BaseEventHandler):
108
+ """Pruned RAG Event Handler."""
109
+
110
+ # events: List[BaseEvent] = [] # TODO: handle removing historical events if they're too old.
111
+
112
+ @classmethod
113
+ def class_name(cls) -> str:
114
+ """Class name."""
115
+ return "RAGEventHandler"
116
+
117
+ def handle(self, event: BaseEvent, **kwargs: Any) -> None:
118
+ """Logic for handling event."""
119
+ print("-----------------------")
120
+ # all events have these attributes
121
+ print(event.id_)
122
+ print(event.timestamp)
123
+ print(event.span_id)
124
+
125
+ # event specific attributes
126
+ if isinstance(event, LLMChatStartEvent):
127
+ # initial
128
+ print(event.messages)
129
+ print(event.additional_kwargs)
130
+ print(event.model_dict)
131
+ elif isinstance(event, LLMChatInProgressEvent):
132
+ # streaming
133
+ print(event.response.delta)
134
+ elif isinstance(event, LLMChatEndEvent):
135
+ # final response
136
+ print(event.response)
137
+
138
+ # self.events.append(event)
139
+ print("-----------------------")
140
+
141
+ class FullRAGEventHandler(BaseEventHandler):
142
+ """RAG event handler. Built off the example custom event handler.
143
+
144
+ In general, logged events are treated as single events in a point in time,
145
+ that link to a span. The span is a collection of events that are related to
146
+ a single task. The span is identified by a unique span_id.
147
+
148
+ While events are independent, there is some hierarchy.
149
+ For example, in query_engine.query() call with a reranker attached:
150
+ - QueryStartEvent
151
+ - RetrievalStartEvent
152
+ - EmbeddingStartEvent
153
+ - EmbeddingEndEvent
154
+ - RetrievalEndEvent
155
+ - RerankStartEvent
156
+ - RerankEndEvent
157
+ - SynthesizeStartEvent
158
+ - GetResponseStartEvent
159
+ - LLMPredictStartEvent
160
+ - LLMChatStartEvent
161
+ - LLMChatEndEvent
162
+ - LLMPredictEndEvent
163
+ - GetResponseEndEvent
164
+ - SynthesizeEndEvent
165
+ - QueryEndEvent
166
+ """
167
+
168
+ events: ClassVar[list[BaseEvent]] = []
169
+ @classmethod
170
+ def class_name(cls) -> str:
171
+ """Class name."""
172
+ return "RAGEventHandler"
173
+
174
+ def _print_event_nodes(self, event_nodes: Sequence[NodeWithScore | BaseNode]) -> str:
175
+ """Print a list of nodes nicely."""
176
+ output_str = "["
177
+ for node in event_nodes:
178
+ output_str += (str(display_source_node(node, 1000)) + "\n")
179
+ output_str += "* * * * * * * * * * * *"
180
+ output_str += "]"
181
+ return (output_str)
182
+
183
+ def handle(self, event: BaseEvent, **kwargs: Any) -> None:
184
+ """Logic for handling event."""
185
+ logger.info("-----------------------")
186
+ # all events have these attributes
187
+ logger.info(event.id_)
188
+ logger.info(event.timestamp)
189
+ logger.info(event.span_id)
190
+
191
+ # event specific attributes
192
+ logger.info(f"Event type: {event.class_name()}")
193
+ if isinstance(event, AgentRunStepStartEvent):
194
+ # logger.info(event.task_id)
195
+ logger.info(event.step)
196
+ logger.info(event.input)
197
+ if isinstance(event, AgentRunStepEndEvent):
198
+ logger.info(event.step_output)
199
+ if isinstance(event, AgentChatWithStepStartEvent):
200
+ logger.info(event.user_msg)
201
+ if isinstance(event, AgentChatWithStepEndEvent):
202
+ logger.info(event.response)
203
+ if isinstance(event, AgentToolCallEvent):
204
+ logger.info(event.arguments)
205
+ logger.info(event.tool.name)
206
+ logger.info(event.tool.description)
207
+ if isinstance(event, StreamChatDeltaReceivedEvent):
208
+ logger.info(event.delta)
209
+ if isinstance(event, StreamChatErrorEvent):
210
+ logger.info(event.exception)
211
+ if isinstance(event, EmbeddingStartEvent):
212
+ logger.info(event.model_dict)
213
+ if isinstance(event, EmbeddingEndEvent):
214
+ logger.info(event.chunks)
215
+ logger.info(event.embeddings[0][:5]) # avoid printing all embeddings
216
+ if isinstance(event, LLMPredictStartEvent):
217
+ logger.info(event.template)
218
+ logger.info(event.template_args)
219
+ if isinstance(event, LLMPredictEndEvent):
220
+ logger.info(event.output)
221
+ if isinstance(event, LLMStructuredPredictStartEvent):
222
+ logger.info(event.template)
223
+ logger.info(event.template_args)
224
+ logger.info(event.output_cls)
225
+ if isinstance(event, LLMStructuredPredictEndEvent):
226
+ logger.info(event.output)
227
+ if isinstance(event, LLMCompletionStartEvent):
228
+ logger.info(event.model_dict)
229
+ logger.info(event.prompt)
230
+ logger.info(event.additional_kwargs)
231
+ if isinstance(event, LLMCompletionEndEvent):
232
+ logger.info(event.response)
233
+ logger.info(event.prompt)
234
+ if isinstance(event, LLMChatInProgressEvent):
235
+ logger.info(event.messages)
236
+ logger.info(event.response)
237
+ if isinstance(event, LLMChatStartEvent):
238
+ logger.info(event.messages)
239
+ logger.info(event.additional_kwargs)
240
+ logger.info(event.model_dict)
241
+ if isinstance(event, LLMChatEndEvent):
242
+ logger.info(event.messages)
243
+ logger.info(event.response)
244
+ if isinstance(event, RetrievalStartEvent):
245
+ logger.info(event.str_or_query_bundle)
246
+ if isinstance(event, RetrievalEndEvent):
247
+ logger.info(event.str_or_query_bundle)
248
+ # logger.info(event.nodes)
249
+ logger.info(self._print_event_nodes(event.nodes))
250
+ if isinstance(event, ReRankStartEvent):
251
+ logger.info(event.query)
252
+ # logger.info(event.nodes)
253
+ for node in event.nodes:
254
+ logger.info(display_source_node(node))
255
+ logger.info(event.top_n)
256
+ logger.info(event.model_name)
257
+ if isinstance(event, ReRankEndEvent):
258
+ # logger.info(event.nodes)
259
+ logger.info(self._print_event_nodes(event.nodes))
260
+ if isinstance(event, QueryStartEvent):
261
+ logger.info(event.query)
262
+ if isinstance(event, QueryEndEvent):
263
+ logger.info(event.response)
264
+ logger.info(event.query)
265
+ if isinstance(event, SpanDropEvent):
266
+ logger.info(event.err_str)
267
+ if isinstance(event, SynthesizeStartEvent):
268
+ logger.info(event.query)
269
+ if isinstance(event, SynthesizeEndEvent):
270
+ logger.info(event.response)
271
+ logger.info(event.query)
272
+ if isinstance(event, GetResponseStartEvent):
273
+ logger.info(event.query_str)
274
+ self.events.append(event)
275
+ logger.info("-----------------------")
276
+
277
+ def _get_events_by_span(self) -> dict[str, list[BaseEvent]]:
278
+ events_by_span: dict[str, list[BaseEvent]] = {}
279
+ for event in self.events:
280
+ if event.span_id in events_by_span:
281
+ events_by_span[event.span_id].append(event)
282
+ elif (event.span_id is not None):
283
+ events_by_span[event.span_id] = [event]
284
+ return events_by_span
285
+
286
+ def _get_event_span_trees(self) -> list[Tree]:
287
+ events_by_span = self._get_events_by_span()
288
+
289
+ trees = []
290
+ tree = Tree()
291
+
292
+ for span, sorted_events in events_by_span.items():
293
+ # create root node i.e. span node
294
+ tree.create_node(
295
+ tag=f"{span} (SPAN)",
296
+ identifier=span,
297
+ parent=None,
298
+ data=sorted_events[0].timestamp,
299
+ )
300
+ for event in sorted_events:
301
+ tree.create_node(
302
+ tag=f"{event.class_name()}: {event.id_}",
303
+ identifier=event.id_,
304
+ parent=event.span_id,
305
+ data=event.timestamp,
306
+ )
307
+ trees.append(tree)
308
+ tree = Tree()
309
+ return trees
310
+
311
+ def print_event_span_trees(self) -> None:
312
+ """View trace trees."""
313
+ trees = self._get_event_span_trees()
314
+ for tree in trees:
315
+ logger.info(
316
+ tree.show(
317
+ stdout=False, sorting=True, key=lambda node: node.data
318
+ )
319
+ )
320
+ logger.info("")
321
+
322
+
323
+ class RAGSpanHandler(BaseSpanHandler[SimpleSpan]):
324
+ span_dict: dict = {}
325
+
326
+ @classmethod
327
+ def class_name(cls) -> str:
328
+ """Class name."""
329
+ return "ExampleSpanHandler"
330
+
331
+ def new_span(
332
+ self,
333
+ id_: str,
334
+ bound_args: Any,
335
+ instance: Any | None = None,
336
+ parent_span_id: str | None = None,
337
+ **kwargs: Any,
338
+ ) -> SimpleSpan | None:
339
+ """Create a span."""
340
+ # logic for creating a new MyCustomSpan
341
+ if id_ not in self.span_dict:
342
+ self.span_dict[id_] = []
343
+ self.span_dict[id_].append(
344
+ SimpleSpan(id_=id_, parent_id=parent_span_id)
345
+ )
346
+
347
+ def prepare_to_exit_span(
348
+ self,
349
+ id_: str,
350
+ bound_args: Any,
351
+ instance: Any | None = None,
352
+ result: Any | None = None,
353
+ **kwargs: Any,
354
+ ) -> Any:
355
+ """Logic for preparing to exit a span."""
356
+ # if id in self.span_dict:
357
+ # return self.span_dict[id].pop()
358
+
359
+ def prepare_to_drop_span(
360
+ self,
361
+ id_: str,
362
+ bound_args: Any,
363
+ instance: Any | None = None,
364
+ err: BaseException | None = None,
365
+ **kwargs: Any,
366
+ ) -> Any:
367
+ """Logic for preparing to drop a span."""
368
+ # if id in self.span_dict:
369
+ # return self.span_dict[id].pop()
370
+
371
+
372
+ def get_obs() -> Dispatcher:
373
+ """Get observability for the RAG pipeline."""
374
+ dispatcher = get_dispatcher()
375
+ event_handler = RAGEventHandler()
376
+ span_handler = RAGSpanHandler()
377
+
378
+ dispatcher.add_event_handler(event_handler)
379
+ dispatcher.add_span_handler(span_handler)
380
+ return dispatcher
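+ # Minimal sketch of turning instrumentation on (typically called once at app startup):
+ #
+ # dispatcher = get_obs()
+ # # ... run queries as usual; RAGEventHandler prints LLM chat events as they happen,
+ # # and RAGSpanHandler keeps a dict of SimpleSpans keyed by span id.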
packages.txt ADDED
@@ -0,0 +1,4 @@
1
+ libmagic-dev
2
+ poppler-utils
3
+ tesseract-ocr
4
+ pandoc
parsers.py ADDED
@@ -0,0 +1,106 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [PARSERS]
3
+ #####################################################
4
+ # Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This project creates an app to chat with PDFs.
8
+
9
+ # This is the PARSERS.
10
+ # It chunks Raw Text into LlamaIndex nodes
11
+ # E.g., by embedding meaning, by sentence, ...
12
+ #####################################################
13
+ # TODO Board:
14
+ # Add more stuff
15
+
16
+ #####################################################
17
+ ## IMPORTS
18
+ from __future__ import annotations
19
+
20
+ from typing import TYPE_CHECKING, Callable, List, Optional
21
+
22
+ from llama_index.core import Settings
23
+ from llama_index.core.node_parser import (
24
+ SemanticSplitterNodeParser,
25
+ SentenceWindowNodeParser,
26
+ )
27
+
28
+ if TYPE_CHECKING:
29
+ from llama_index.core.base.embeddings.base import BaseEmbedding
30
+ from llama_index.core.callbacks import CallbackManager
31
+ from llama_index.core.node_parser.interface import NodeParser
32
+
33
+ # from wtpsplit import SaT
34
+
35
+ # Lazy Loading
36
+
37
+ #####################################################
38
+ ## CODE
39
+ # def sentence_splitter_from_SaT(model: Optional[SaT]) -> Callable[[str], List[str]]:
40
+ # """Convert a SaT model into a sentence splitter function.
41
+
42
+ # Args:
43
+ # model (SaT): The Segment Anything model.
44
+
45
+ # Returns:
46
+ # Callable[[str], List[str]]: The sentence splitting function using the SaT model.
47
+ # """
48
+ # model = model or ss.model
49
+ # if model is None:
50
+ # raise ValueError("Sentence splitting model is not set.")
51
+
52
+ # def sentence_splitter(text: str) -> List[str]:
53
+ # segments = model.split(text_or_texts=text)
54
+ # if isinstance(segments, list):
55
+ # return segments
56
+ # else:
57
+ # return list(segments) # type: ignore (generator is the other option?)
58
+
59
+ # return (sentence_splitter)
60
+
61
+ # @st.cache_resource # can't cache because embed_model is not hashable.
62
+ def get_parser(
63
+ embed_model: BaseEmbedding,
64
+ # sentence_model: Optional[SaT] = None,
65
+ sentence_splitter: Optional[Callable[[str], List[str]]] = None,
66
+ callback_manager: Optional[CallbackManager] = None
67
+ ) -> NodeParser:
68
+ """Parse RAG document processing (main one)."""
69
+ # if (sentence_model is not None) and (sentence_splitter is not None):
70
+ # sentence_splitter = sentence_splitter_from_SaT(sentence_model)
71
+
72
+ return SemanticSplitterNodeParser.from_defaults(
73
+ embed_model=embed_model,
74
+ breakpoint_percentile_threshold=95,
75
+ buffer_size=3,
76
+ sentence_splitter=sentence_splitter,
77
+ callback_manager=callback_manager or Settings.callback_manager,
78
+ include_metadata=True,
79
+ include_prev_next_rel=True,
80
+ )
81
+
82
+
83
+ # @st.cache_resource
84
+ # def get_sentence_parser(splitter_model: Optional[SaT] = None) -> SentenceWindowNodeParser:
85
+ # """Special sentence-level parser to get the document requested info section."""
86
+ # if (splitter_model is not None):
87
+ # sentence_splitter = sentence_splitter_from_SaT(splitter_model)
88
+
89
+ # sentence_parser = SentenceWindowNodeParser.from_defaults(
90
+ # sentence_splitter=sentence_splitter,
91
+ # window_size=0,
92
+ # window_metadata_key="window",
93
+ # original_text_metadata_key="original_text",
94
+ # )
95
+ # return (sentence_parser)
96
+
97
+ def get_sentence_parser() -> SentenceWindowNodeParser:
98
+ """Parse sentences to get the document requested info section."""
99
+ # if (splitter_model is not None):
100
+ # sentence_splitter = sentence_splitter_from_SaT(splitter_model)
101
+ return SentenceWindowNodeParser.from_defaults(
102
+ # sentence_splitter=sentence_splitter,
103
+ window_size=0,
104
+ window_metadata_key="window",
105
+ original_text_metadata_key="original_text",
106
+ )
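
A minimal sketch of calling the two parsers above, assuming an embedding model has already been configured on Settings (e.g., in models.py); the sample documents are illustrative only.

    from llama_index.core import Document, Settings
    from parsers import get_parser, get_sentence_parser

    # Assumes Settings.embed_model was set elsewhere in the app.
    parser = get_parser(embed_model=Settings.embed_model)
    nodes = parser.get_nodes_from_documents(
        [Document(text="First idea. Second idea. Now a different topic entirely.")]
    )

    sentence_parser = get_sentence_parser()
    sentence_nodes = sentence_parser.get_nodes_from_documents(
        [Document(text="One sentence. Another sentence.")]
    )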
pdf_reader.py ADDED
@@ -0,0 +1,528 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [PDF READER]
3
+ #####################################################
4
+ # Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This project creates an app to chat with PDFs.
8
+
9
+ # This is the PDF READER.
10
+ # It converts a PDF into LlamaIndex nodes
11
+ # using UnstructuredIO.
12
+ #####################################################
13
+ # TODO Board:
14
+ # I don't think the current code is elegant... :(
15
+
16
+ # TODO: Replace chunk_by_header with a custom solution replicating bySimilarity
17
+ # https://docs.unstructured.io/api-reference/api-services/chunking#by-similarity-chunking-strategy
18
+ # Some hybrid thing...
19
+
20
+
21
+ # Come up with a way to handle summarizing images and tables using MultiModalLLM after the processing into nodes.
22
+ # TODO: Put this into PDFReaderUtilities? Along with the other functions for stuff like email?
23
+
24
+ # Investigate PDFPlumber as a backup/alternative for Unstructured.
25
+ # `https://github.com/jsvine/pdfplumber`
26
+ # nevermind, this is essentially pdfminer.six but nicer
27
+
28
+ # Chunk hierarchy from https://www.reddit.com/r/LocalLLaMA/comments/1dpb9ow/how_we_chunk_turning_pdfs_into_hierarchical/
29
+ # Investigate document parsing algorithms from https://github.com/BobLd/DocumentLayoutAnalysis?tab=readme-ov-file
30
+ # Investigate document parsing algorithms from https://github.com/Filimoa/open-parse?tab=readme-ov-file
31
+
32
+ # Competition:
33
+ # https://github.com/infiniflow/ragflow
34
+ # https://github.com/deepdoctection/deepdoctection
35
+
36
+ #####################################################
37
+ ## IMPORTS
38
+ import os
39
+ import re
40
+ import regex
41
+ from copy import deepcopy
42
+
43
+ from abc import ABC, abstractmethod
44
+ from typing import Any, List, Tuple, IO, Optional, Type, Generic, TypeVar
45
+ from llama_index.core.bridge.pydantic import Field
46
+
47
+ import numpy as np
48
+
49
+ from io import BytesIO
50
+ from base64 import b64encode, b64decode
51
+ from PIL import Image as PILImage
52
+
53
+ # from pdf_reader_utils import clean_pdf_chunk, dedupe_title_chunks, combine_listitem_chunks
54
+
55
+ # Unstructured Document Parsing
56
+ from unstructured.partition.pdf import partition_pdf
57
+ # from unstructured.cleaners.core import clean_extra_whitespace, group_broken_paragraphs #, clean_ordered_bullets, clean_bullets, clean_dashes
58
+ # from unstructured.chunking.title import chunk_by_title
59
+ # Unstructured Element Types
60
+ from unstructured.documents import elements, email_elements
61
+ from unstructured.partition.utils.constants import PartitionStrategy
62
+
63
+ # Llamaindex Nodes
64
+ from llama_index.core.settings import Settings
65
+ from llama_index.core.schema import Document, BaseNode, TextNode, ImageNode, NodeRelationship, RelatedNodeInfo
66
+ from llama_index.core.readers.base import BaseReader
67
+ from llama_index.core.base.embeddings.base import BaseEmbedding
68
+ from llama_index.core.node_parser import NodeParser
69
+
70
+ # Parallelism for cleaning chunks
71
+ from joblib import Parallel, delayed
72
+
73
+ ## Lazy Imports
74
+ # import nltk
75
+ #####################################################
76
+
77
+ # Additional padding around the PDF extracted images
78
+ PDF_IMAGE_HORIZONTAL_PADDING = 20
79
+ PDF_IMAGE_VERTICAL_PADDING = 20
80
+ os.environ['EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD'] = str(PDF_IMAGE_HORIZONTAL_PADDING)
81
+ os.environ['EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD'] = str(PDF_IMAGE_VERTICAL_PADDING)
82
+
83
+ # class TextReader(BaseReader):
84
+ # def __init__(self, text: str) -> None:
85
+ # """Init params."""
86
+ # self.text = text
87
+
88
+
89
+ # class ImageReader(BaseReader):
90
+ # def __init__(self, image: Any) -> None:
91
+ # """Init params."""
92
+ # self.image = image
93
+
94
+ GenericNode = TypeVar("GenericNode", bound=BaseNode) # https://mypy.readthedocs.io/en/stable/generics.html
95
+
96
+ class UnstructuredPDFReader():
97
+ # Yes, we could inherit from LlamaIndex BaseReader even though I don't think it's a good idea.
98
+ # Have you seen the Llamaindex Base Reader? It's silly. """OOP"""
99
+ # https://docs.llamaindex.ai/en/stable/api_reference/readers/
100
+
101
+ # here I'm basically cargo culting off the (not-very-good) pre-built Llamaindex one.
102
+ # https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/unstructured/base.py
103
+
104
+ # yes I do want to bind these to the class.
105
+ # you better not be changing the embedding model or node parser on me across different PDFReaders. that's absurd.
106
+ # embed_model: BaseEmbedding
107
+ # _node_parser: NodeParser# = Field(
108
+ # description="Node parser to run on each Unstructured Title Chunk",
109
+ # default=Settings.node_parser,
110
+ # )
111
+ _max_characters: int# = Field(
112
+ # description="The maximum number of characters in a node",
113
+ # default=8192,
114
+ # )
115
+ _new_after_n_chars: int #= Field(
116
+ # description="The number of characters after which a new node is created",
117
+ # default=1024,
118
+ # )
119
+ _overlap_n_chars: int #= Field(
120
+ # description="The number of characters to overlap between nodes",
121
+ # default=128,
122
+ # )
123
+ _overlap: int #= Field(
124
+ # description="The number of characters to overlap between nodes",
125
+ # default=128,
126
+ # )
127
+ _overlap_all: bool #= Field(
128
+ # description="Whether to overlap all nodes",
129
+ # default=False,
130
+ # )
131
+ _multipage_sections: bool #= Field(
132
+ # description="Whether to include multipage sections",
133
+ # default=False,
134
+ # )
135
+
136
+ ## TODO: Fix this big ball of primitives and turn it into a class.
137
+ def __init__(
138
+ self,
139
+ # node_parser: Optional[NodeParser], # Suggest using a SemanticNodeParser.
140
+ max_characters: int = 2048,
141
+ new_after_n_chars: int = 512,
142
+ overlap_n_chars: int = 128,
143
+ overlap: int = 128,
144
+ overlap_all: bool = False,
145
+ multipage_sections: bool = True,
146
+ **kwargs: Any
147
+ ) -> None:
148
+ # node_parser = node_parser or Settings.node_parser
149
+ """Init params."""
150
+ super().__init__(**kwargs)
151
+
152
+ self._max_characters = max_characters
153
+ self._new_after_n_chars = new_after_n_chars
154
+ self._overlap_n_chars = overlap_n_chars
155
+ self._overlap = overlap
156
+ self._overlap_all = overlap_all
157
+ self._multipage_sections = multipage_sections
158
+ # self._node_parser = node_parser or Settings.node_parser # set node parser to run on each Unstructured Title Chunk
159
+
160
+ # Prerequisites for Unstructured.io to work
161
+ # import nltk
162
+ # nltk.data.path = ['./nltk_data']
163
+ # try:
164
+ # if not nltk.data.find("tokenizers/punkt"):
165
+ # # nltk.download("punkt")
166
+ # print("Can't find punkt.")
167
+ # except Exception as e:
168
+ # # nltk.download("punkt")
169
+ # print(e)
170
+ # try:
171
+ # if not nltk.data.find("taggers/averaged_perceptron_tagger"):
172
+ # # nltk.download("averaged_perceptron_tagger")
173
+ # print("Can't find averaged_perceptron_tagger.")
174
+ # except Exception as e:
175
+ # # nltk.download("averaged_perceptron_tagger")
176
+ # print(e)
177
+
178
+
179
+ # """DATA LOADING FUNCTIONS"""
180
+ def _node_rel_prev_next(self, prev_node: GenericNode, next_node: GenericNode) -> Tuple[GenericNode, GenericNode]:
181
+ """Update pre-next node relationships between two nodes."""
182
+ prev_node.relationships[NodeRelationship.NEXT] = RelatedNodeInfo(
183
+ node_id=next_node.node_id,
184
+ metadata={"filename": next_node.metadata['filename']}
185
+ )
186
+ next_node.relationships[NodeRelationship.PREVIOUS] = RelatedNodeInfo(
187
+ node_id=prev_node.node_id,
188
+ metadata={"filename": prev_node.metadata['filename']}
189
+ )
190
+ return (prev_node, next_node)
191
+
192
+ def _node_rel_parent_child(self, parent_node: GenericNode, child_node: GenericNode) -> Tuple[GenericNode, GenericNode]:
193
+ """Update parent-child node relationships between two nodes."""
194
+ parent_node.relationships[NodeRelationship.CHILD] = RelatedNodeInfo(
195
+ node_id=child_node.node_id,
196
+ metadata={"filename": child_node.metadata['filename']}
197
+ )
198
+ child_node.relationships[NodeRelationship.PARENT] = RelatedNodeInfo(
199
+ node_id=parent_node.node_id,
200
+ metadata={"filename": parent_node.metadata['filename']}
201
+ )
202
+ return (parent_node, child_node)
203
+
204
+ def _handle_metadata(
205
+ self,
206
+ pdf_chunk: elements.Element,
207
+ node: GenericNode,
208
+ kept_metadata: List[str] = [
209
+ 'filename', 'file_directory', 'coordinates',
210
+ 'page_number', 'page_name', 'section',
211
+ 'sent_from', 'sent_to', 'subject',
212
+ 'parent_id', 'category_depth',
213
+ 'text_as_html', 'languages',
214
+ 'emphasized_text_contents', 'link_texts', 'link_urls',
215
+ 'is_continuation', 'detection_class_prob',
216
+ ]) -> GenericNode:
217
+ """Add common unstructured element metadata to LlamaIndex node."""
218
+ pdf_chunk_metadata = pdf_chunk.metadata.to_dict() if pdf_chunk.metadata else {}
219
+ current_kept_metadata = deepcopy(kept_metadata)
220
+
221
+ # Handle some interesting keys
222
+ node.metadata['type'] = pdf_chunk.category
223
+ if (('filename' in current_kept_metadata) and ('filename' in pdf_chunk_metadata) and ('file_directory' in pdf_chunk_metadata)):
224
+ filename = os.path.join(str(pdf_chunk_metadata['file_directory']), str(pdf_chunk_metadata['filename']))
225
+ node.metadata['filename'] = filename
226
+ current_kept_metadata.remove('file_directory') if ('file_directory' in current_kept_metadata) else None
227
+ if (('text_as_html' in current_kept_metadata) and ('text_as_html' in pdf_chunk_metadata)):
228
+ node.metadata['orignal_table_text'] = getattr(node, 'text', '')
229
+ node.text = pdf_chunk_metadata['text_as_html']
230
+ current_kept_metadata.remove('text_as_html')
231
+ if (('coordinates' in current_kept_metadata) and (pdf_chunk_metadata.get('coordinates') is not None)):
232
+ node.metadata['coordinates'] = pdf_chunk_metadata['coordinates']
233
+ current_kept_metadata.remove('coordinates')
234
+ if (('page_number' in current_kept_metadata) and ('page_number' in pdf_chunk_metadata)):
235
+ node.metadata['page_number'] = [pdf_chunk_metadata['page_number']] # save as list to allow for multiple pages
236
+ current_kept_metadata.remove('page_number')
237
+ if (('page_name' in current_kept_metadata) and ('page_name' in pdf_chunk_metadata)):
238
+ node.metadata['page_name'] = [pdf_chunk_metadata['page_name']] # save as list to allow for multiple sheets
239
+ current_kept_metadata.remove('page_name')
240
+
241
+ # Handle the remaining keys
242
+ for key in set(current_kept_metadata).intersection(set(pdf_chunk_metadata.keys())):
243
+ node.metadata[key] = pdf_chunk_metadata[key]
244
+
245
+ return node
246
+
247
+ def _handle_text_chunk(self, pdf_text_chunk: elements.Element) -> TextNode:
248
+ """Given a text chunk from Unstructured, convert it to a TextNode for LlamaIndex.
249
+
250
+ Args:
251
+ pdf_text_chunk (elements.Element): Input text chunk from Unstructured.
252
+
253
+ Returns:
254
+ TextNode: LlamaIndex TextNode which saves the text as HTML for structure.
255
+ """
256
+ new_node = TextNode(
257
+ text=pdf_text_chunk.text,
258
+ id_=pdf_text_chunk.id,
259
+ excluded_llm_metadata_keys=['type', 'parent_id', 'depth', 'filename', 'coordinates', 'link_texts', 'link_urls', 'link_start_indexes', 'orig_nodes', 'orignal_table_text', 'languages', 'detection_class_prob', 'keyword_metadata'],
260
+ excluded_embed_metadata_keys=['type', 'parent_id', 'depth', 'filename', 'coordinates', 'page number', 'original_text', 'window', 'link_texts', 'link_urls', 'link_start_indexes', 'orig_nodes', 'orignal_table_text', 'languages', 'detection_class_prob']
261
+ )
262
+ new_node = self._handle_metadata(pdf_text_chunk, new_node)
263
+ return (new_node)
264
+
265
+
266
+ def _handle_table_chunk(self, pdf_table_chunk: elements.Table | elements.TableChunk) -> TextNode:
267
+ """Given a table chunk from Unstructured, convert it to a TextNode for LlamaIndex.
268
+
269
+ Args:
270
+ pdf_table_chunk (elements.Table | elements.TableChunk): Input table chunk from Unstructured
271
+
272
+ Returns:
273
+ TextNode: LlamaIndex TextNode which saves the table as HTML for structure.
274
+
275
+ NOTE: You will need to get the summary of the table for better performance.
276
+ """
277
+ new_node = TextNode(
278
+ text=pdf_table_chunk.metadata.text_as_html if pdf_table_chunk.metadata.text_as_html else pdf_table_chunk.text,
279
+ id_=pdf_table_chunk.id,
280
+ excluded_llm_metadata_keys=['type', 'parent_id', 'depth', 'filename', 'coordinates', 'link_texts', 'link_urls', 'link_start_indexes', 'orig_nodes', 'orignal_table_text', 'languages', 'detection_class_prob', 'keyword_metadata'],
281
+ excluded_embed_metadata_keys=['type', 'parent_id', 'depth', 'filename', 'coordinates', 'page number', 'original_text', 'window', 'link_texts', 'link_urls', 'link_start_indexes', 'orig_nodes', 'orignal_table_text', 'languages', 'detection_class_prob']
282
+ )
283
+ new_node = self._handle_metadata(pdf_table_chunk, new_node)
284
+ return (new_node)
285
+
286
+
287
+ def _handle_image_chunk(self, pdf_image_chunk: elements.Element) -> ImageNode:
288
+ """Given an image chunk from UnstructuredIO, read it in and convert it into a Llamaindex ImageNode.
289
+
290
+ Args:
291
+ pdf_image_chunk (elements.Element): The input image element from UnstructuredIO. We'll allow all types, just in case you want to process some weird chunks.
292
+
293
+ Returns:
294
+ ImageNode: The image saved as a Llamaindex ImageNode.
295
+ """
296
+ pdf_image_chunk_data_available = pdf_image_chunk.metadata.to_dict()
297
+
298
+ # Check for either saved image_path or image_base64/image_mime_type
299
+ if (('image_path' not in pdf_image_chunk_data_available) and ('image_base64' not in pdf_image_chunk_data_available)):
300
+ raise Exception('Image chunk does not have either image_path or image_base64/image_mime_type. Are you sure this is an image?')
301
+
302
+ # Make the image node.
303
+ new_node = ImageNode(
304
+ text=pdf_image_chunk.text,
305
+ id_=pdf_image_chunk.id,
306
+ excluded_llm_metadata_keys=['type', 'parent_id', 'depth', 'filename', 'coordinates', 'link_texts', 'link_urls', 'link_start_indexes', 'orig_nodes', 'languages', 'detection_class_prob', 'keyword_metadata'],
307
+ excluded_embed_metadata_keys=['type', 'parent_id', 'depth', 'filename', 'coordinates', 'page number', 'original_text', 'window', 'link_texts', 'link_urls', 'link_start_indexes', 'orig_nodes', 'languages', 'detection_class_prob']
308
+ )
309
+ new_node = self._handle_metadata(pdf_image_chunk, new_node)
310
+
311
+ # Add image data to image node
312
+ image = None
313
+ if ('image_path' in pdf_image_chunk_data_available):
314
+ # Save image path to image node
315
+ new_node.image_path = pdf_image_chunk_data_available['image_path']
316
+
317
+ # Load image from path, convert to base64
318
+ image_pil = PILImage.open(pdf_image_chunk_data_available['image_path'])
319
+ image_buffer = BytesIO()
320
+ image_pil.save(image_buffer, format='JPEG')
321
+ image = b64encode(image_buffer.getvalue()).decode('utf-8')
322
+
323
+ new_node.image = image
324
+ new_node.image_mimetype = 'image/jpeg'
325
+ del image_buffer, image_pil
326
+ elif ('image_base64' in pdf_image_chunk_data_available):
327
+ # Save image base64 to image node
328
+ new_node.image = pdf_image_chunk_data_available['image_base64']
329
+ new_node.image_mimetype = pdf_image_chunk_data_available['image_mime_type']
330
+
331
+ return (new_node)
332
+
333
+
334
+ def _handle_composite_chunk(self, pdf_composite_chunk: elements.CompositeElement) -> BaseNode:
335
+ """Given a composite chunk from Unstructured, convert it into a node and handle it dependencies as well."""
336
+ # Start by getting a list of all the nodes which were combined into the composite chunk.
337
+ # child_chunks = pdf_composite_chunk.metadata.to_dict()['orig_elements']
338
+ child_chunks = pdf_composite_chunk.metadata.orig_elements or []
339
+ child_nodes = []
340
+ for chunk in child_chunks:
341
+ child_nodes.append(self._handle_chunk(chunk)) # process all the child chunks.
342
+
343
+ # Then build the Composite Chunk into a Node.
344
+ composite_node = self._handle_text_chunk(pdf_text_chunk=pdf_composite_chunk)
345
+ composite_node = self._handle_metadata(pdf_composite_chunk, composite_node)
346
+
347
+ # Set relationships between chunks.
348
+ for index in range(1, len(child_nodes)):
349
+ child_nodes[index-1], child_nodes[index] = self._node_rel_prev_next(child_nodes[index-1], child_nodes[index])
350
+ for index, node in enumerate(child_nodes):
351
+ composite_node, child_nodes[index] = self._node_rel_parent_child(composite_node, child_nodes[index])
352
+
353
+ composite_node.metadata['orig_nodes'] = child_nodes
354
+ composite_node.excluded_llm_metadata_keys = ['filename', 'coordinates', 'chunk_number', 'window', 'orig_nodes', 'languages', 'detection_class_prob', 'keyword_metadata']
355
+ composite_node.excluded_embed_metadata_keys = ['filename', 'coordinates', 'chunk_number', 'page number', 'original_text', 'window', 'summary', 'orig_nodes', 'languages', 'detection_class_prob']
356
+ return(composite_node)
357
+
358
+
359
+ def _handle_chunk(self, chunk: elements.Element) -> BaseNode:
360
+ """Convert Unstructured element chunks to Llamaindex Node. Determine which chunk handling to use based on the element type."""
361
+ # Composite (multiple nodes combined together by chunking)
362
+ if (isinstance(chunk, elements.CompositeElement)):
363
+ return (self._handle_composite_chunk(pdf_composite_chunk=chunk))
364
+ # Tables
365
+ elif ((chunk.category == 'Table') and isinstance(chunk, (elements.Table, elements.TableChunk))):
366
+ return(self._handle_table_chunk(pdf_table_chunk=chunk))
367
+ # Images
368
+ elif (any(True for chunk_info in ['image', 'image_base64', 'image_path'] if chunk_info in chunk.metadata.to_dict())):
369
+ return(self._handle_image_chunk(pdf_image_chunk=chunk))
370
+ # Text
371
+ else:
372
+ return(self._handle_text_chunk(pdf_text_chunk=chunk))
373
+
374
+
375
+ def pdf_to_chunks(
376
+ self,
377
+ file_path: Optional[str],
378
+ file: Optional[IO[bytes]],
379
+ ) -> List[elements.Element]:
380
+ """
381
+ Given the file path to a PDF, read it in with UnstructuredIO and return its elements.
382
+ """
383
+ print("NEWPDF: Partitioning into Chunks...")
384
+ # 1. attempt using AUTO to have it decide.
385
+ # NOTE: this takes care of pdfminer, and also chooses between using detectron2 vs tesseract only.
386
+ # However, it sometimes gets confused by PDFs where text elements are added on later, e.g., CIDs for linking, or REDACTED
387
+ pdf_chunks = partition_pdf(
388
+ filename=file_path,
389
+ file=file,
390
+ unique_element_ids=True, # UUIDs that are unique for each element
391
+ strategy=PartitionStrategy.HI_RES, # auto: it decides, hi_res: detectron2, but issues with multi-column, ocr_only: pytesseract, fast: pdfminer
392
+ hi_res_model_name='yolox',
393
+ include_page_breaks=False,
394
+ metadata_filename=file_path,
395
+ infer_table_structure=True,
396
+ extract_images_in_pdf=True,
397
+ extract_image_block_types=['Image', 'Table', 'Formula'], # element types to save as images
398
+ extract_image_block_to_payload=False, # needs to be false; we'll convert into base64 later.
399
+ extract_forms=False, # not currently available
400
+ extract_image_block_output_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data/pdfimgs/')
401
+ )
402
+
403
+ # # 2. Check if it got good output.
404
+ # pdf_read_in_okay = self.check_pdf_read_in(pdf_file_path=pdf_file_path, pdf_file=pdf_file, pdf_chunks=pdf_chunks)
405
+ # if (pdf_read_in_okay):
406
+ # return pdf_chunks
407
+
408
+ # # 3. Okay, PDF didn't read in well, so we'll use the back-up strategy
409
+ # # According to Unstructured's Github: https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/pdf.py
410
+ # # that is "OCR_ONLY" as opposed to "HI_RES".
411
+ # pdf_chunks = partition_pdf(
412
+ # filename=pdf_file_path,
413
+ # file=pdf_file,
414
+ # strategy="ocr_only" # auto: it decides, hi_res: detectron2, but issues with multi-column, ocr_only: pytesseract, fast: pdfminer
415
+ # )
416
+ return pdf_chunks
417
+
418
+
419
+ def chunks_to_nodes(self, pdf_chunks: List[elements.Element]) -> List[BaseNode]:
420
+ """
421
+ Given a PDF's element chunks from Unstructured,
422
+ convert them into nodes using the node_parser.
423
+ E.g., to have all sentences with similar meaning as a node, use the SemanticNodeParser
424
+ """
425
+ # 0. Setup.
426
+ unstructured_chunk_nodes = []
427
+
428
+ # Hash of node ID and index
429
+ node_id_to_index = {}
430
+
431
+ # 1. Convert each page's text to Nodes.
432
+ for index, chunk in enumerate(pdf_chunks):
433
+ # Create new node based on node type
434
+ new_node = self._handle_chunk(chunk)
435
+
436
+ # Update hash of node ID and index
437
+ node_id_to_index[new_node.id_] = index
438
+
439
+ # Add relationship to prior node
440
+ if (len(unstructured_chunk_nodes) > 0):
441
+ unstructured_chunk_nodes[-1], new_node = self._node_rel_prev_next(prev_node=unstructured_chunk_nodes[-1], next_node=new_node)
442
+
443
+ # Add parent-child relationships for Title Chunks
444
+ if (chunk.metadata.parent_id is not None):
445
+ # Find the index of the parent node based on parent_id
446
+ parent_index = node_id_to_index[chunk.metadata.parent_id]
447
+ if (parent_index is not None):
448
+ unstructured_chunk_nodes[parent_index], new_node = self._node_rel_parent_child(parent_node=unstructured_chunk_nodes[parent_index], child_node=new_node)
449
+
450
+ # Append to list
451
+ unstructured_chunk_nodes.append(new_node)
452
+
453
+ del node_id_to_index
454
+
455
+ ## TODO: Move this chunk into a separate ReaderPostProcessor in PDFReaderUtils. Bundle the summarization for tables and images into it.
456
+ # 2. Node Parse each page to split when new information is different
457
+ # NOTE: This was built for the Semantic Parser, but I guess we'll technically allow any parser here.
458
+ # unstructured_parsed_nodes = self._node_parser.get_nodes_from_documents(unstructured_chunk_nodes)
459
+
460
+ # 3. Node Attributes
461
+ # for index, node in enumerate(unstructured_parsed_nodes):
462
+ # # Keywords and Summary
463
+ # # node_keywords = ', '.join(pdfrutils.get_keywords(node.text, top_k=5))
464
+ # # node_summary = get_t5_summary(node.text, summary_length=64) # get_t5_summary
465
+ # node.metadata['keywords'] = node_keywords
466
+ # # node.metadata['summary'] = node_summary + (("\n" + node.metadata['summary']) if node.metadata['summary'] is not None else "")
467
+
468
+ # # Get additional information about the node.
469
+ # # Email: check for address.
470
+ # info_types = []
471
+ # if (pdfrutils.has_date(node.text)):
472
+ # info_types.append("date")
473
+ # if (pdfrutils.has_email(node.text)):
474
+ # info_types.append("contact email")
475
+ # if (pdfrutils.has_mail_addr(node.text)):
476
+ # info_types.append("mailing postal address")
477
+ # if (pdfrutils.has_phone(node.text)):
478
+ # info_types.append("contact phone")
479
+
480
+ # node.metadata['information types'] = ", ".join(info_types)
481
+ # node.excluded_llm_metadata_keys = ['filename', 'coordinates', 'chunk_number', 'window', 'orig_nodes']
482
+ # node.excluded_embed_metadata_keys = ['filename', 'coordinates', 'chunk_number', 'page number', 'original_text', 'window', 'keywords', 'summary', 'orig_nodes']
483
+
484
+ # if (index > 0):
485
+ # unstructured_parsed_nodes[index-1], node = self._node_rel_prev_next(unstructured_parsed_nodes[index-1], node)
486
+ return(unstructured_chunk_nodes)
487
+
488
+ # """Main user-interaction function"""
489
+ def load_data(
490
+ self,
491
+ file_path: Optional[str] = None,
492
+ file: Optional[IO[bytes]] = None
493
+ ) -> List: #[GenericNode]:
494
+ """Given a path to a PDF file, load it with Unstructured and convert it into a list of Llamaindex Base Nodes.
495
+ Input:
496
+ - file_path (Optional[str]): the path to the PDF file; alternatively, pass an open binary file via the file argument.
497
+ Output:
498
+ - List[GenericNode]: a list of LlamaIndex nodes, one per parsed Unstructured element chunk.
499
+ """
500
+ # 1. PDF to Chunks
501
+ print("NEWPDF: Reading Input File...")
502
+ pdf_chunks = self.pdf_to_chunks(file_path=file_path, file=file)
503
+ # return (pdf_chunks)
504
+
505
+ # Chunk processing
506
+ # pdf_chunks = clean_pdf_chunk, dedupe_title_chunks, combine_listitem_chunks, remove_header_footer_pagenum
507
+
508
+ # 2. Chunks to titles
509
+ # TODO: I hate this, make our own chunker.
510
+ # pdf_titlechunks = chunk_by_title(
511
+ # pdf_chunks,
512
+ # max_characters=self._max_characters,
513
+ # new_after_n_chars=self._new_after_n_chars,
514
+ # overlap=self._overlap,
515
+ # overlap_all=self._overlap_all,
516
+ # multipage_sections=self._multipage_sections,
517
+ # include_orig_elements=True,
518
+ # combine_text_under_n_chars=self._new_after_n_chars
519
+ # )
520
+ # 3. Cleaning
521
+ # pdf_titlechunks = Parallel(n_jobs=max(int(os.cpu_count())-1, 1))( # type: ignore
522
+ # delayed(self.clean_pdf_chunk)(chunk) for chunk in pdf_chunks # pdf_titlechunks
523
+ # )
524
+ # pdf_titlechunks = list(pdf_titlechunks)
525
+ # 4. Headlines to llamaindex nodes
526
+ print("NEWPDF: Converting chunks to nodes...")
527
+ parsed_chunks = self.chunks_to_nodes(pdf_chunks)
528
+ return (parsed_chunks)
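
A minimal sketch of driving the reader above; the PDF path is hypothetical, and the post-processing helpers referenced in the commented-out pipeline live in pdf_reader_utils.py.

    from pdf_reader import UnstructuredPDFReader

    reader = UnstructuredPDFReader(max_characters=2048, new_after_n_chars=512)
    nodes = reader.load_data(file_path="data/example.pdf")  # hypothetical path

    for node in nodes[:3]:
        print(node.metadata.get("type"), node.metadata.get("page_number"))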
pdf_reader_utils.py ADDED
@@ -0,0 +1,592 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [PDF READER UTILITIES]
3
+ #####################################################
4
+ # Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This project creates an app to chat with PDFs.
8
+
9
+ # This is the PDF READER UTILITIES.
10
+ # It defines helper functions for the PDF reader,
11
+ # such as getting Keywords or finding Contact Info.
12
+ #####################################################
13
+ ### TODO Board:
14
+ # Better Summarizer than T5, which has been stripped out?
15
+ # Better keywords than the RAKE+YAKE fusion we're currently using?
16
+ # Consider using GPE/GSP tagging with spacy to confirm mailing addresses?
17
+
18
+ # Handle FigureCaption somehow.
19
+ # Skip Header if it has a Page X or other page number construction.
20
+
21
+ # Detect images that are substantially overlapping according to coordinates.
22
+ # https://stackoverflow.com/questions/49897531/detect-overlapping-images-in-pil
23
+ # Keep them in the following order: no confidence score, larger image, higher confidence score
24
+
25
+ # Detect nodes whose text is substantially repeated at either the top or bottom of the page.
26
+ # Utilize the coordinates to ignore the text on the top and bottom two lines.
27
+
28
+ # Fix OCR issues with spell checking?
29
+
30
+ # Remove images that are too small in size, and overlapping with text boxes.
31
+
32
+ # Convert the List[BaseNode] -> List[BaseNode] functions into TransformComponents
33
+
34
+ #####################################################
35
+ ### Imports
36
+ from __future__ import annotations
37
+
38
+ import difflib
39
+ import re
40
+ from collections import defaultdict
41
+ from copy import deepcopy
42
+ from typing import (
43
+ TYPE_CHECKING,
44
+ List,
45
+ Optional,
46
+ Tuple,
47
+ TypeVar,
48
+ )
49
+
50
+ import rapidfuzz
51
+ import regex
52
+ from llama_index.core.schema import (
53
+ BaseNode,
54
+ NodeRelationship,
55
+ RelatedNodeInfo,
56
+ )
57
+
58
+ if TYPE_CHECKING:
59
+ from unstructured.documents import elements
60
+
61
+ #####################################################
62
+ ### CODE
63
+
64
+ GenericNode = TypeVar("GenericNode", bound=BaseNode)
65
+
66
+ def clean_pdf_chunk(pdf_chunk: elements.Element) -> elements.Element:
67
+ """Given a single element of text from a pdf read by Unstructured, clean its text."""
68
+ ### NOTE: Don't think it's worth making this a separate TransformComponent.
69
+ # We'd still need to clean bad characters from the reader.
70
+ chunk_text = pdf_chunk.text
71
+ if (len(chunk_text) > 0):
72
+ # Clean any control characters which break the language detection for other parts of the reader.
73
+ re_bad_chars = regex.compile(r"[\p{Cc}\p{Cs}]+")
74
+ chunk_text = re_bad_chars.sub("", chunk_text)
75
+
76
+ # Remove PDF citations text
77
+ chunk_text = re.sub("\\(cid:\\d+\\)", "", chunk_text) # matches (cid:###)
78
+ # Clean whitespace and broken paragraphs
79
+ # chunk_text = clean_extra_whitespace(chunk_text)
80
+ # chunk_text = group_broken_paragraphs(chunk_text)
81
+ # Save cleaned text.
82
+ pdf_chunk.text = chunk_text
83
+
84
+ return pdf_chunk
85
+
86
+
87
+ def clean_abbreviations(pdf_chunks: list[GenericNode]) -> list[GenericNode]:
88
+ """Remove any common abbreviations in the text which can confuse the sentence model.
89
+
90
+ Args:
91
+ pdf_chunks (List[GenericNode]): List of llama-index nodes.
92
+
93
+ Returns:
94
+ List[GenericNode]: The nodes with cleaned text, abbreviations replaced.
95
+ """
96
+ for pdf_chunk in pdf_chunks:
97
+ text = getattr(pdf_chunk, "text", "")
98
+ if (text == ""):
99
+ continue
100
+ # No. -> Number
101
+ text = re.sub(r"\bNo\b\.\s", "Number", text, flags=re.IGNORECASE)
102
+ # Fig. -> Figure
103
+ text = re.sub(r"\bFig\b\.", "Figure", text, flags=re.IGNORECASE)
104
+ # Eq. -> Equation
105
+ text = re.sub(r"\bEq\b\.", "Equation", text, flags=re.IGNORECASE)
106
+ # Mr. -> Mr
107
+ text = re.sub(r"\bMr\b\.", "Mr", text, flags=re.IGNORECASE)
108
+ # Mrs. -> Mrs
109
+ text = re.sub(r"\bMrs\b\.", "Mrs", text, flags=re.IGNORECASE)
110
+ # Dr. -> Dr
111
+ text = re.sub(r"\bDr\b\.", "Dr", text, flags=re.IGNORECASE)
112
+ # Jr. -> Jr
113
+ text = re.sub(r"\bJr\b\.", "Jr", text, flags=re.IGNORECASE)
114
+ # etc. -> etc
115
+ text = re.sub(r"\betc\b\.", "etc", text, flags=re.IGNORECASE)
116
+ pdf_chunk.text = text
117
+
118
+ return pdf_chunks
119
+
120
+
121
+ def _remove_chunk(
122
+ pdf_chunks: list[GenericNode],
123
+ chunk_index: int | None=None,
124
+ chunk_id: str | None=None
125
+ ) -> list[GenericNode]:
126
+ """Given a list of chunks, remove the chunk at the given index or with the given id.
127
+
128
+ Args:
129
+ pdf_chunks (List[GenericNode]): The list of chunks.
130
+ chunk_index (Optional[int]): The index of the chunk to remove.
131
+ chunk_id (Optional[str]): The id of the chunk to remove.
132
+
133
+ Returns:
134
+ List[GenericNode]: The updated list of chunks, without the removed chunk.
135
+ """
136
+ if (chunk_index is None and chunk_id is None):
137
+ msg = "_remove_chunk: Either chunk_index or chunk_id must be set."
138
+ raise ValueError(msg)
139
+
140
+ # Convert chunk_id to chunk_index
141
+ elif (chunk_index is None):
142
+ chunk = next((c for c in pdf_chunks if c.node_id == chunk_id), None)
143
+ if chunk is not None:
144
+ chunk_index = pdf_chunks.index(chunk)
145
+ else:
146
+ msg = f"_remove_chunk: No chunk found with id {chunk_id}."
147
+ raise ValueError(msg)
148
+ elif (chunk_index < 0 or chunk_index >= len(pdf_chunks)):
149
+ msg = f"_remove_chunk: Chunk {chunk_index} is out of range. Maximum index is {len(pdf_chunks) - 1}."
150
+ raise ValueError(msg)
151
+
152
+ # Update the previous-next node relationships around that index
153
+ def _node_rel_prev_next(prev_node: GenericNode, next_node: GenericNode) -> tuple[GenericNode, GenericNode]:
154
+ """Update pre-next node relationships between two nodes."""
155
+ prev_node.relationships[NodeRelationship.NEXT] = RelatedNodeInfo(
156
+ node_id=next_node.node_id,
157
+ metadata={"filename": next_node.metadata["filename"]}
158
+ )
159
+ next_node.relationships[NodeRelationship.PREVIOUS] = RelatedNodeInfo(
160
+ node_id=prev_node.node_id,
161
+ metadata={"filename": prev_node.metadata["filename"]}
162
+ )
163
+ return (prev_node, next_node)
164
+
165
+ if (chunk_index > 0 and chunk_index < len(pdf_chunks) - 1):
166
+ pdf_chunks[chunk_index - 1], pdf_chunks[chunk_index + 1] = _node_rel_prev_next(prev_node=pdf_chunks[chunk_index - 1], next_node=pdf_chunks[chunk_index + 1])
167
+
168
+ popped_chunk = pdf_chunks.pop(chunk_index)
169
+ chunk_id = chunk_id or popped_chunk.node_id
170
+
171
+ # Remove any references to the removed chunk in node relationships or metadata
172
+ for node in pdf_chunks:
173
+ node.relationships = {k: v for k, v in node.relationships.items() if v.node_id != chunk_id}
174
+ node.metadata = {k: v for k, v in node.metadata.items() if ((isinstance(v, list) and (chunk_id in v)) or (v != chunk_id))}
175
+ return pdf_chunks
176
+
177
+
178
+ def _clean_overlap_text(
179
+ text1: str,
180
+ text2: str,
181
+ combining_text: str=" ",
182
+ min_length: int | None = 1,
183
+ max_length: int | None = 50,
184
+ overlap_threshold: float = 0.9
185
+ ) -> str:
186
+ r"""Remove any overlapping text between two strings.
187
+
188
+ Args:
189
+ text1 (str): The first string.
190
+ text2 (str): The second string.
191
+ combining_text (str, optional): The text to combine the two strings with. Defaults to space (' '). Can also be \n.
192
+ min_length (int, optional): The minimum length of the overlap. Defaults to 1. None is no minimum.
193
+ max_length (int, optional): The maximum length of the overlap. Defaults to 50. None is no maximum.
194
+ overlap_threshold (float, optional): The threshold for being an overlap. Defaults to 0.9.
195
+
196
+ Returns:
197
+ str: The strings combined with the overlap removed.
198
+ """
199
+ for overlap_len in range(min(len(text1), len(text2), (max_length or len(text1))), ((min_length or 1)-1), -1):
200
+ end_substring = text1[-overlap_len:]
201
+ start_substring = text2[:overlap_len]
202
+ similarity = difflib.SequenceMatcher(None, end_substring, start_substring).ratio()
203
+ if (similarity >= overlap_threshold):
204
+ return combining_text.join([text1[:-overlap_len].rstrip(), text2.lstrip()]).strip()  # keep one copy of the overlapping text (from text2)
205
+
206
+ return combining_text.join([text1, text2]).strip()
207
+
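
A tiny illustrative call of the helper above (values made up; assumes the module is importable as pdf_reader_utils): with the 0.9 default threshold, the duplicated text at the seam is kept only once in the joined result.

    from pdf_reader_utils import _clean_overlap_text

    joined = _clean_overlap_text("the quick brown fox", "brown fox jumps over")
    # joined == "the quick brown fox jumps over"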
208
+
209
+ def _combine_chunks(c1: GenericNode, c2: GenericNode) -> GenericNode:
210
+ """Combine two chunks into one.
211
+
212
+ Args:
213
+ c1 (GenericNode): The first chunk.
214
+ c2 (GenericNode): The second chunk.
215
+
216
+ Returns:
217
+ GenericNode: The combined chunk.
218
+ """
219
+ # Metadata merging
220
+ # Type merging
221
+ text_types = ["NarrativeText", "ListItem", "Formula", "UncategorizedText", "Composite-TextOnly"]
222
+ image_types = ["FigureCaption", "Image"] # things that make Image nodes.
223
+
224
+ def _combine_chunks_type(c1_type: str, c2_type: str) -> str:
225
+ """Combine the types of two chunks.
226
+
227
+ Args:
228
+ c1_type (str): The type of the first chunk.
229
+ c2_type (str): The type of the second chunk.
230
+
231
+ Returns:
232
+ str: The type of the combined chunk.
233
+ """
234
+ if (c1_type == c2_type):
235
+ return c1_type
236
+ elif (c1_type in text_types and c2_type in text_types):
237
+ return "Composite-TextOnly"
238
+ elif (c1_type in image_types and c2_type in image_types):
239
+ return "Image" # Add caption to image
240
+ else:
241
+ return "Composite"
242
+
243
+ c1_type = c1.metadata["type"]
244
+ c2_type = c2.metadata["type"]
245
+ c1.metadata["type"] = _combine_chunks_type(c1_type, c2_type)
246
+
247
+ # All other metadata merging
248
+ for k, v in c2.metadata.items():
249
+ if k not in c1.metadata:
250
+ c1.metadata[k] = v
251
+ # Merge lists
252
+ elif k in ["page_number", 'page_name', 'languages', 'emphasized_text_contents', 'link_texts', 'link_urls']:
253
+ if not isinstance(c1.metadata[k], list):
254
+ c1.metadata[k] = [c1.metadata[k]]  # wrap the scalar in a list rather than iterating over it
255
+ if (v not in c1.metadata[k]):
256
+ # Add to list, dedupe
257
+ c1.metadata[k].extend(v if isinstance(v, list) else [v])
258
+ c1.metadata[k] = sorted(set(c1.metadata[k]))
259
+
260
+ # Text merging
261
+ c1_text = getattr(c1, "text", "")
262
+ c2_text = getattr(c2, "text", "")
263
+ if (c1_text == c2_text):
264
+ # No duplicates.
265
+ return c1
266
+ if (c1_text == "" or c2_text == ""):
267
+ c1.text = c1_text + c2_text
268
+ return c1
269
+
270
+ # Check if a sentence has been split between two chunks
271
+ # Option 1: letters
272
+ c1_text_last = c1_text[-1]
273
+
274
+ # Check if c1_text_last has a lowercase letter, digit, or punctuation that doesn't end a sentence
275
+ if (re.search(r'[\da-z\[\]\(\)\{\}\<\>\%\^\&\"\'\:\;\,\/\-\_\+\= \t\n\r]', c1_text_last)):
276
+ # We can probably combine these two texts as if they were on the same line.
277
+ c1.text = _clean_overlap_text(c1_text, c2_text, combining_text=" ")
278
+ else:
279
+ # We'll treat these as if they were on separate lines.
280
+ c1.text = _clean_overlap_text(c1_text, c2_text, combining_text="\n")
281
+
282
+ # NOTE: Relationships merging is handled in other functions, because it requires looking back at prior prior chunks.
283
+ return c1
284
+
285
+ def dedupe_title_chunks(pdf_chunks: list[GenericNode]) -> list[GenericNode]:
286
+ """Given a list of chunks, return a list of chunks without any title duplicates.
287
+
288
+ Args:
289
+ pdf_chunks (List[BaseNode]): The list of chunks to have titles deduped.
290
+
291
+ Returns:
292
+ List[BaseNode]: The deduped list of chunks.
293
+ """
294
+ index = 0
295
+ while (index < len(pdf_chunks)):
296
+ if (
297
+ (pdf_chunks[index].metadata["type"] in ("Title")) # is title
298
+ and (index > 0) # is not first chunk
299
+ and (pdf_chunks[index - 1].metadata["type"] in ("Title")) # previous chunk is also title
300
+ ):
301
+ # if (getattr(pdf_chunks[index], 'text', None) != getattr(pdf_chunks[index - 1], 'text', '')):
302
+ # pdf_chunks[index].text = getattr(pdf_chunks[index - 1], 'text', '') + '\n' + getattr(pdf_chunks[index], 'text', '')
303
+ pdf_chunks[index] = _combine_chunks(pdf_chunks[index - 1], pdf_chunks[index])
304
+
305
+ # NOTE: We'll remove the PRIOR title, since duplicates AND child relationships are built on the CURRENT title.
306
+ # There shouldn't be any PARENT/CHILD relationships to the title that we are deleting, so this seems fine.
307
+ pdf_chunks = _remove_chunk(pdf_chunks=pdf_chunks, chunk_index=index-1)
308
+ # NOTE: don't need to shift index because we removed an element.
309
+ else:
310
+ # We don't care about any situations other than consecutive title chunks.
311
+ index += 1
312
+
313
+ return (pdf_chunks)
314
+
315
+
316
+ def combine_listitem_chunks(pdf_chunks: list[GenericNode]) -> list[GenericNode]:
317
+ """Given a list of chunks, combine any adjacent chunks which are ListItems into one List.
318
+
319
+ Args:
320
+ pdf_chunks (List[GenericNode]): The list of chunks to combine.
321
+
322
+ Returns:
323
+ List[GenericNode]: The list of chunks with ListItems combined into one List chunk.
324
+ """
325
+ index = 0
326
+ while (index < len(pdf_chunks)):
327
+ if (
328
+ (pdf_chunks[index].metadata["type"] == "ListItem") # is list item
329
+ and (index > 0) # is not first chunk
330
+ and (pdf_chunks[index - 1].metadata["type"] == "ListItem") # previous chunk is also list item
331
+ ):
332
+ # Okay, we have a consecutive list item. Combine into one list.
333
+ # NOTE: We'll remove the PRIOR list item, since duplicates AND child relationships are built on the CURRENT list item.
334
+ # 1. Append prior list item's text to the current list item's text
335
+ # pdf_chunks[index].text = getattr(pdf_chunks[index - 1], 'text', '') + '\n' + getattr(pdf_chunks[index], 'text', '')
336
+ pdf_chunks[index] = _combine_chunks(pdf_chunks[index - 1], pdf_chunks[index])
337
+ # 2. Remove PRIOR list item
338
+ pdf_chunks.pop(index - 1)
339
+ # 3. Replace NEXT relationship from PRIOR list item with the later list item node ID, if prior prior node exists.
340
+ if (index - 2 >= 0):
341
+ pdf_chunks[index - 2].relationships[NodeRelationship.NEXT] = RelatedNodeInfo(
342
+ node_id=pdf_chunks[index].node_id,
343
+ metadata={"filename": pdf_chunks[index].metadata["filename"]}
344
+ )
345
+ # 4. Replace PREVIOUS relationship from LATER list item with the prior prior node ID, if prior prior node exists.
346
+ pdf_chunks[index].relationships[NodeRelationship.PREVIOUS] = RelatedNodeInfo(
347
+ node_id=pdf_chunks[index - 2].node_id,
348
+ metadata={"filename": pdf_chunks[index - 2].metadata['filename']}
349
+ )
350
+ # NOTE: the PARENT/CHILD relationships should be the same as the previous list item, so this seems fine.
351
+ else:
352
+ # We don't care about any situations other than consecutive list item chunks.
353
+ index += 1
354
+ return (pdf_chunks)
355
+
356
+
357
+ def remove_header_footer_repeated(
358
+ pdf_chunks_input: list[GenericNode],
359
+ window_size: int = 3,
360
+ fuzz_threshold: int = 80
361
+ ) -> list[GenericNode]:
362
+ """Given a list of chunks, remove any header/footer chunks that are repeated across pages.
363
+
364
+ Args:
365
+ pdf_chunks (List[GenericNode]): The list of chunks to process.
366
+ window_size (int): The number of chunks to consider at the beginning and end of each page.
367
+ fuzz_threshold (int): The threshold for fuzzy matching of chunk texts.
368
+
369
+ Returns:
370
+ List[GenericNode]: The list of chunks with header/footer chunks removed.
371
+ """
372
+ nodes_to_remove = set() # id's to remove.
373
+ pdf_chunks = deepcopy(pdf_chunks_input)
374
+
375
+ # Build a dictionary of chunks by page number
376
+ chunks_by_page = defaultdict(list)
377
+ for chunk in pdf_chunks:
378
+ chunk_page_number = min(chunk.metadata["page_number"]) if isinstance(chunk.metadata["page_number"], list) else chunk.metadata["page_number"]
379
+ chunks_by_page[chunk_page_number].append(chunk)
380
+
381
+ # Get the first window_size and last window_size chunks on each page
382
+ header_candidates = defaultdict(set) # hashmap of chunk text, and set of chunk ids with that text.
383
+ footer_candidates = defaultdict(set) # hashmap of chunk text, and set of chunk ids with that text.
384
+ page_number_regex = re.compile(r"(?:-|\( ?)?\b(?:page|p\.?(?:[pg](?:\b|\.)?)?)? ?(?:\d+|\b[ivxm]+\b)\.?(?: ?-|\))?\b", re.IGNORECASE)
385
+ for chunks in chunks_by_page.values():
386
+ header_chunks = chunks[:window_size]
387
+ footer_chunks = chunks[-window_size:]
388
+
389
+ for chunk in header_chunks:
390
+ chunk_text = getattr(chunk, "text", "")
391
+ if chunk.metadata["type"] == "Header" and len(chunk_text) > 0:
392
+ chunk_text_is_pagenum_only = page_number_regex.match(chunk_text)
393
+ if chunk_text_is_pagenum_only and (len(chunk_text_is_pagenum_only.group(0)) == len(chunk_text)):
394
+ # Full match!
395
+ chunk.text = "Page Number Only"
396
+ nodes_to_remove.add(chunk.node_id)
397
+ elif chunk_text_is_pagenum_only and len(chunk_text_is_pagenum_only.group(0)) > 0:
398
+ # Remove the page number content from the chunk text for this exercise
399
+ chunk_text = page_number_regex.sub('', chunk_text)
400
+ chunk.text = chunk_text
401
+
402
+ if chunk.metadata["type"] not in ("Image", "Table") and len(chunk_text) > 0:
403
+ header_candidates[chunk_text].add(chunk.node_id)
404
+
405
+ for chunk in footer_chunks:
406
+ chunk_text = getattr(chunk, "text", "")
407
+ if chunk.metadata["type"] == "Footer" and len(chunk_text) > 0:
408
+ chunk_text_is_pagenum_only = page_number_regex.match(chunk_text)
409
+ if chunk_text_is_pagenum_only and (len(chunk_text_is_pagenum_only.group(0)) == len(chunk_text)):
410
+ # Full match!
411
+ chunk.text = "Page Number Only"
412
+ nodes_to_remove.add(chunk.node_id)
413
+ elif chunk_text_is_pagenum_only and len(chunk_text_is_pagenum_only.group(0)) > 0:
414
+ # Remove the page number content from the chunk text for this exercise
415
+ chunk_text = page_number_regex.sub('', chunk_text)
416
+ chunk.text = chunk_text
417
+
418
+ if chunk.metadata["type"] not in ("Image", "Table") and len(chunk_text) > 0:
419
+ footer_candidates[chunk_text].add(chunk.node_id)
420
+
421
+ # Identify any texts which are too similar to other header texts.
422
+ header_texts = list(header_candidates.keys())
423
+ header_distance_matrix = rapidfuzz.process.cdist(header_texts, header_texts, scorer=rapidfuzz.fuzz.ratio, score_cutoff=fuzz_threshold)
424
+
425
+ footer_texts = list(footer_candidates.keys())
426
+ footer_distance_matrix = rapidfuzz.process.cdist(footer_texts, footer_texts, scorer=rapidfuzz.fuzz.ratio, score_cutoff=fuzz_threshold)
427
+ # Combine header candidates which are too similar to each other in the distance matrix
428
+ for i in range(len(header_distance_matrix)-1):
429
+ for j in range(i+1, len(header_distance_matrix)):
430
+ if i == j:
431
+ continue
432
+ if header_distance_matrix[i][j] >= fuzz_threshold:
433
+ header_candidates[header_texts[i]].update(header_candidates[header_texts[j]])
434
+ header_candidates[header_texts[j]].update(header_candidates[header_texts[i]])
435
+
436
+ for i in range(len(footer_distance_matrix)-1):
437
+ for j in range(i+1, len(footer_distance_matrix)):
438
+ if i == j:
439
+ continue
440
+ if footer_distance_matrix[i][j] >= fuzz_threshold:
441
+ footer_candidates[footer_texts[i]].update(footer_candidates[footer_texts[j]])
442
+ footer_candidates[footer_texts[j]].update(footer_candidates[footer_texts[i]])
443
+
444
+ headers_to_remove = set()
445
+ for chunk_ids in header_candidates.values():
446
+ if len(chunk_ids) > 1:
447
+ headers_to_remove.update(chunk_ids)
448
+
449
+ footers_to_remove = set()
450
+ for chunk_ids in footer_candidates.values():
451
+ if len(chunk_ids) > 1:
452
+ footers_to_remove.update(chunk_ids)
453
+
454
+ nodes_to_remove = nodes_to_remove.union(headers_to_remove.union(footers_to_remove))
455
+
456
+ for node_id in nodes_to_remove:
457
+ pdf_chunks = _remove_chunk(pdf_chunks=pdf_chunks, chunk_id=node_id)
458
+
459
+ return pdf_chunks
460
+
461
+ def remove_overlap_images(pdf_chunks: list[GenericNode]) -> list[GenericNode]:
462
+ # TODO(Jonathan Wang): Implement this function to remove images which are completely overlapping each other
463
+ # OR... get a better dang reader!
464
+ raise NotImplementedError
465
+
466
+
467
+ def chunk_by_header(
468
+ pdf_chunks_in: list[GenericNode],
469
+ combine_text_under_n_chars: int = 1024,
470
+ multipage_sections: bool = True,
471
+ # ) -> Tuple[List[GenericNode], List[GenericNode]]:
472
+ ) -> list[GenericNode]:
473
+ """Combine chunks together that are part of the same header and have similar meaning.
474
+
475
+ Args:
476
+ pdf_chunks (List[GenericNode]): List of chunks to be combined.
477
+
478
+ Returns:
479
+ List[GenericNode]: List of combined chunks.
480
+ List[GenericNode]: List of original chunks, with node references updated.
481
+ """
482
+ # TODO(Jonathan Wang): Handle semantic chunking between elements within a Header chunk.
483
+ # TODO(Jonathan Wang): Handle splitting element chunks if they are over `max_characters` in length (does this ever really happen?)
484
+ # TODO(Jonathan Wang): Handle relationships between nodes.
485
+
486
+ pdf_chunks = deepcopy(pdf_chunks_in)
487
+ output = []
488
+ id_to_index = {}
489
+ index = 0
490
+
491
+ # Pass 1: Combine chunks together that are part of the same title chunk.
492
+ while (index < len(pdf_chunks)):
493
+ chunk = pdf_chunks[index]
494
+ if (chunk.metadata["type"] in ["Header", "Footer", "Image", "Table"]):
495
+ # These go immediately into the semantic title chunks and also reset the new node.
496
+
497
+ # Let's add a newline to distinguish from any other content.
498
+ if (chunk.metadata["type"] in ["Header", "Footer", "Table"]):
499
+ chunk.text = getattr(chunk, "text", "") + "\n"
500
+
501
+ output.append(chunk)
502
+ index += 1
503
+ continue
504
+
505
+ # Make a new node if we have a new title (or if we don't have a title).
506
+ if (
507
+ chunk.metadata["type"] == "Title"
508
+ ):
509
+ # We're good, this node can stay as a TitleChunk.
510
+ chunk.metadata['type'] = 'Composite'
511
+ # if (not isinstance(chunk.metadata['page number'], list)):
512
+ # chunk.metadata['page number'] = [chunk.metadata['page number']]
513
+
514
+ # Let's add a newline to distinguish the title from the content.
515
+ setattr(chunk, 'text', getattr(chunk, 'text', '') + "\n")
516
+
517
+ output.append(chunk)
518
+ id_to_index[chunk.id_] = len(output) - 1
519
+ index += 1
520
+ continue
521
+
522
+ elif (chunk.metadata.get('parent_id', None) in id_to_index):
523
+ # This chunk is part of the same title as a prior chunk.
524
+ # Add this text into the prior title node.
525
+ jndex = id_to_index[chunk.metadata['parent_id']]
526
+
527
+ # if (not isinstance(output[jndex].metadata['page number'], list)):
528
+ # output[jndex].metadata['page number'] = [chunk.metadata['page number']]
529
+
530
+ output[jndex] = _combine_chunks(output[jndex], chunk)
531
+ # output[jndex].text = getattr(output[jndex], 'text', '') + '\n' + getattr(chunk, 'text', '')
532
+ # output[jndex].metadata['page number'] = list(set(output[jndex].metadata['page number'] + [chunk.metadata['page number']]))
533
+ # output[jndex].metadata['languages'] = list(set(output[jndex].metadata['languages'] + chunk.metadata['languages']))
534
+
535
+ pdf_chunks.remove(chunk)
536
+ continue
537
+
538
+ elif (
539
+ (chunk.metadata.get('parent_id', None) is None)
540
+ and (
541
+ len(getattr(chunk, 'text', '')) > combine_text_under_n_chars # big enough text section to stand alone
542
+ or (len(id_to_index.keys()) <= 0) # no prior title
543
+ )
544
+ ):
545
+ # Okay, so either we don't have a title, or it was interrupted by an image / table.
546
+ # This chunk can stay as a TextChunk.
547
+ chunk.metadata['type'] = 'Composite-TextOnly'
548
+ # if (not isinstance(chunk.metadata['page number'], list)):
549
+ # chunk.metadata['page number'] = [chunk.metadata['page number']]
550
+
551
+ output.append(chunk)
552
+ id_to_index[chunk.id_] = len(output) - 1
553
+ index += 1
554
+ continue
555
+
556
+ else:
557
+ # Add the text to the prior node that isn't a table or image.
558
+ jndex = len(output) - 1
559
+ while (
560
+ (jndex >= 0)
561
+ and (output[jndex].metadata['type'] in ['Table', 'Image'])
562
+ ):
563
+ # for title_chunk in output:
564
+ # print(f'''{title_chunk.id_}: {title_chunk.metadata['type']}, text: {title_chunk.text}, parent: {title_chunk.metadata['parent_id']}''')
565
+ jndex -= 1
566
+
567
+ if (jndex < 0):
568
+ raise Exception(f'''Prior title chunk not found: {index}, {chunk.metadata.get('parent_id', None)}''')
569
+
570
+ # Add this text into the prior title node.
571
+ # if (not isinstance(output[jndex].metadata['page number'], list)):
572
+ # output[jndex].metadata['page number'] = [chunk.metadata['page number']]
573
+
574
+ output[jndex] = _combine_chunks(output[jndex], chunk)
575
+ # output[jndex].text = getattr(output[jndex], 'text', '') + ' ' + getattr(chunk, 'text', '')
576
+ # output[jndex].metadata['page number'] = list(set(output[jndex].metadata['page number'] + [chunk.metadata['page number']]))
577
+ # output[jndex].metadata['languages'] = list(set(output[jndex].metadata['languages'] + chunk.metadata['languages']))
578
+
579
+ pdf_chunks.remove(chunk)
580
+ # TODO: Update relationships between nodes.
581
+ continue
582
+
583
+ return (output)
584
+
585
+
586
+ ### TODO:
587
+ # Merge images together that are substantially overlapping.
588
+ # Favour image with no confidence score. (these come straight from pdf).
589
+ # Favour the larger image over the smaller one.
590
+ # Favour the image with higher confidence score.
591
+ def merge_images() -> None:
592
+ pass
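
A sketch of the post-processing order these helpers appear intended for, mirroring the commented-out pipeline in pdf_reader.load_data; the input PDF path is hypothetical and the thresholds are simply the defaults defined above.

    from pdf_reader import UnstructuredPDFReader
    from pdf_reader_utils import (
        chunk_by_header,
        clean_abbreviations,
        combine_listitem_chunks,
        dedupe_title_chunks,
        remove_header_footer_repeated,
    )

    nodes = UnstructuredPDFReader().load_data(file_path="data/example.pdf")  # hypothetical path
    nodes = clean_abbreviations(nodes)
    nodes = dedupe_title_chunks(nodes)
    nodes = combine_listitem_chunks(nodes)
    nodes = remove_header_footer_repeated(nodes, window_size=3, fuzz_threshold=80)
    nodes = chunk_by_header(nodes, combine_text_under_n_chars=1024)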
prompts.py ADDED
@@ -0,0 +1,86 @@
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [PROMPTS]
3
+ #####################################################
4
+ # Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This project creates an app to chat with PDFs.
8
+
9
+ # This is the prompts sent to the LLM.
10
+ #####################################################
11
+ ## TODOS:
12
+ # Use the row names instead of .at indesx locators
13
+ # This is kinda dumb because we read the same .csv file over again
14
+ # Should we structure this abstraction differently?
15
+
16
+ #####################################################
17
+ ## IMPORTS:
18
+ import pandas as pd
19
+ from llama_index.core import PromptTemplate
20
+
21
+ #####################################################
22
+ ## CODE:
23
+
24
+ # https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/prompts/default_prompts.py
25
+ QA_PROMPT = """Context information is below.\n
26
+ ---------------------
27
+ {context_str}
28
+ ---------------------
29
+ Given the context information, answer the query.
30
+ You must adhere to the following rules:
31
+ - Use the context information, not prior knowledge.
32
+ - End the answer with any brief quote(s) from the context that are the most essential in answering the question.
33
+ - If the context is not helpful in answering the question, do not include a quote.
34
+
35
+ Query: {query_str}
36
+ Answer: """
37
+
38
+ # https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/prompts/default_prompts.py
39
+ REFINE_PROMPT = """The original query is as follows: {query_str}
40
+ We have provided an existing answer: {existing_answer}
41
+ We have the opportunity to refine the existing answer (only if needed) with some more context below.
42
+ ---------------------
43
+ {context_msg}
44
+ ---------------------
45
+ Given the new context, refine the original answer to better answer the query.
46
+ You must adhere to the following rules:
47
+ - If the context isn't useful, return the original answer.
48
+ - End the answer with any brief quote(s) from the original answer or new context that are the most essential in answering the question.
49
+ - If the new context is not helpful in answering the question, leave the original answer unchanged.
50
+
51
+ Refined Answer: """
52
+
53
+ def get_qa_prompt(
54
+ # prompt_file_path: str
55
+ ) -> PromptTemplate:
56
+ """Given a path to the prompts, get prompt for Question-Answering"""
57
+ # prompts = pd.read_csv(prompt_file_path)
58
+ # https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/prompts/default_prompts.py
59
+ custom_qa_prompt = PromptTemplate(
60
+ QA_PROMPT
61
+ )
62
+ return (custom_qa_prompt)
63
+
64
+
65
+ def get_refine_prompt(
66
+ # prompt_file_path: str
67
+ ) -> PromptTemplate:
68
+ """Given a path to the prompts, get prompt to Refine answer after new info"""
69
+ # prompts = pd.read_csv(prompt_file_path)
70
+ # https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/prompts/default_prompts.py
71
+ custom_refine_prompt = PromptTemplate(
72
+ REFINE_PROMPT
73
+ )
74
+ return (custom_refine_prompt)
75
+
76
+
77
+ # def get_reqdoc_prompt(
78
+ # prompt_file_path: str
79
+ # ) -> PromptTemplate:
80
+ # """Given a path to the prompts, get prompt to identify requested info from document."""
81
+ # prompts = pd.read_csv(prompt_file_path)
82
+ # # https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/prompts/default_prompts.py
83
+ # reqdoc_prompt = PromptTemplate(
84
+ # prompts.at[2, 'Prompt']
85
+ # )
86
+ # return (reqdoc_prompt)
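A hedged usage sketch of these templates: get_response_synthesizer and its text_qa_template / refine_template parameters come from llama_index.core, but whether the rest of this app wires the prompts in exactly this way is an assumption.

from llama_index.core import get_response_synthesizer
from prompts import get_qa_prompt, get_refine_prompt

synth = get_response_synthesizer(
    text_qa_template=get_qa_prompt(),     # fills {context_str} / {query_str}
    refine_template=get_refine_prompt(),  # fills {query_str} / {existing_answer} / {context_msg}
)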
pyproject.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://www.reddit.com/r/Python/comments/13h2xuc/any_musthave_extensions_for_working_with_python/
2
+
3
+ [tool.isort]
4
+ profile = "black"
5
+
6
+ [tool.mypy]
7
+ warn_unused_configs = true
8
+ exclude = "archives|build|docs"
9
+ show_column_numbers = true
10
+ show_error_codes = true
11
+ strict = true
12
+ plugins = ["numpy.typing.mypy_plugin"]
13
+
14
+ [tool.ruff]
15
+ select = ["ALL"]
16
+ ignore = [
17
+ "ANN101", # Missing type annotation for self in method
18
+ "COM", # flake8-commas
19
+ "D100", # Missing docstring in public module
20
+ "D101", # Missing docstring in public class
21
+ "D102", # Missing docstring in public method
22
+ "D103", # Missing docstring in public function
23
+ "D104", # Missing docstring in public package
24
+ "D406", # Section name should end with a newline
25
+ "D407", # Missing dashed underline after section
26
+ "FBT", # flake8-boolean-trap
27
+ "G004", # Logging statement uses f-string
28
+ # "PD901", # df is a bad variable name. Be kinder to your future self.
29
+ "PTH123", # open() should be replaced by Path.open()
30
+ "RET505", # Unnecessary `elif` after `return` statement (I think this improves readability)
31
+ "RET506", # Unnecessary `else` after `return` statement (I think this improves readability)
32
+ "T20", # flake8-print
33
+ "TD003", # Missing issue link on the line following this TODO (I don't have an issue system)
34
+ ]
35
+ src = ["src"]
36
+
37
+ [tool.ruff.per-file-ignores]
38
+ "tests/**/*.py" = [
39
+ "S101", # Use of assert detected
40
+ ]
41
+
42
+ [tool.ruff.pydocstyle]
43
+ convention = "numpy"
44
+
45
+ [tool.pyright]
46
+ typeCheckingMode = "strict"
47
+
48
+ reportMissingTypeStubs = false
49
+ reportPrivateUsage = false
50
+ reportUnknownArgumentType = false
51
+ reportUnknownMemberType = false
52
+ reportUnknownParameterType = false
53
+ reportUnknownVariableType = false
requirements.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cu124
2
+ torch>=2.4.0+cu124
3
+ torchaudio>=2.4.0+cu124
4
+ torchvision>=0.19.0+cu124
5
+ transformers>=4.41.1
6
+ accelerate>=0.28.0
7
+ quanto
8
+ optimum-quanto # bitsandbytes replacement, seems better?
9
+ sentence-transformers
10
+ einops
11
+ einops_exts
12
+ open_clip_torch>=2.24.0
13
+ treelib
14
+ nltk>=3.9
15
+ # multi-rake
16
+ yake
17
+ symspellpy
18
+ rapidfuzz
19
+ streamlit
20
+ streamlit-pdf-viewer
21
+ opencv-python
22
+ pdf2image
23
+ pytesseract
24
+ pdfplumber>=0.11.3
25
+ pdfminer.six>=20231228 # fixes a possible infinite loop in the PDFMiner read-in used by unstructured[all-docs]
26
+ unstructured[all-docs]>=0.15.5
27
+ llama-index-core
28
+ llama-index-embeddings-huggingface
29
+ llama-index-vector-stores-qdrant
30
+ llama-index-retrievers-bm25
31
+ llama-index-llms-huggingface
32
+ llama-index-llms-groq
33
+ llama-index-question-gen-openai # required for subquestionqueryengine
34
+ llama-index-multi-modal-llms-openai
retriever.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [RETRIEVER]
3
+ #####################################################
4
+ # Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This project creates an app to chat with PDFs.
8
+
9
+ # This is the RETRIEVER
10
+ # which defines the main way that document
11
+ # snippets are identified.
12
+
13
+ #####################################################
14
+ ## TODO:
15
+
16
+ #####################################################
17
+ ## IMPORTS:
18
+ import logging
19
+ from typing import Optional, List, Tuple, Dict, cast
20
+ from collections import defaultdict
21
+
22
+ import streamlit as st
23
+
24
+ import numpy as np
25
+
26
+ from llama_index.core.utils import truncate_text
27
+ from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever
28
+ from llama_index.retrievers.bm25 import BM25Retriever
29
+
30
+ from llama_index.core import VectorStoreIndex #, StorageContext,
31
+ from llama_index.core.schema import BaseNode, IndexNode, NodeWithScore, QueryBundle
32
+ from llama_index.core.callbacks.base import CallbackManager
33
+
34
+ # Own Modules:
35
+ from merger import _merge_on_scores
36
+
37
+ # Lazy Loading:
38
+
39
+ #####################################################
40
+ ## CODE:
41
+ class RAGRetriever(BaseRetriever):
42
+ """
43
+ Jonathan Wang's custom-built retriever over our vector store.
44
+ Combination of Hybrid Retrieval (BM25 x Vector Embeddings) + AutoMergingRetriever
45
+ https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/retrievers/auto_merging_retriever.py
46
+ """
47
+ def __init__(
48
+ self,
49
+ vector_store_index: VectorStoreIndex,
50
+
51
+ semantic_top_k: int = 10,
52
+ sparse_top_k: int = 6,
53
+
54
+ fusion_similarity_top_k: int = 10, # total number of snippets to retrieve after the Reciprocal Rerank.
55
+ semantic_weight_fraction: float = 0.6, # percentage weight to give to semantic cosine vs sparse bm25
56
+ merge_up_thresh: float = 0.5, # fraction of nodes needed to be retrieved to merge up to semantic level
57
+
58
+ verbose: bool = True,
59
+ callback_manager: Optional[CallbackManager] = None,
60
+ object_map: Optional[dict] = None,
61
+ objects: Optional[List[IndexNode]] = None,
62
+ ) -> None:
63
+ """Init params."""
64
+ self._vector_store_index = vector_store_index
65
+
66
+ self.sentence_vector_retriever = VectorIndexRetriever(
67
+ index=vector_store_index, similarity_top_k=semantic_top_k
68
+ )
69
+ self.sentence_bm25_retriever = BM25Retriever.from_defaults(
70
+ # nodes=list(vector_store_index.storage_context.docstore.docs.values())
71
+ index=vector_store_index # TODO: Confirm this works.
72
+ , similarity_top_k=sparse_top_k
73
+ )
74
+
75
+ self._fusion_similarity_top_k = fusion_similarity_top_k
76
+ self._semantic_weight_fraction = semantic_weight_fraction
77
+ self._merge_up_thresh = merge_up_thresh
78
+
79
+ super().__init__(
80
+ # callback_manager=callback_manager,
81
+ object_map=object_map,
82
+ objects=objects,
83
+ verbose=verbose,
84
+ )
85
+
86
+
87
+ @classmethod
88
+ def class_name(cls) -> str:
89
+ """Class name."""
90
+ return "RAGRetriever"
91
+
92
+
93
+ def _get_parents_and_merge(
94
+ self, nodes: List[NodeWithScore]
95
+ ) -> Tuple[List[NodeWithScore], bool]:
96
+ """Get parents and merge nodes."""
97
+ # retrieve all parent nodes
98
+ parent_nodes: Dict[str, BaseNode] = {}
99
+ parent_cur_children_dict: Dict[str, List[NodeWithScore]] = defaultdict(list)
100
+ for node in nodes:
101
+ if node.node.parent_node is None:
102
+ continue
103
+ parent_node_info = node.node.parent_node
104
+
105
+ # Fetch actual parent node if doesn't exist in `parent_nodes` cache yet
106
+ parent_node_id = parent_node_info.node_id
107
+ if parent_node_id not in parent_nodes:
108
+ parent_node = self._vector_store_index.storage_context.docstore.get_document(
109
+ parent_node_id
110
+ )
111
+ parent_nodes[parent_node_id] = cast(BaseNode, parent_node)
112
+
113
+ # add reference to child from parent
114
+ parent_cur_children_dict[parent_node_id].append(node)
115
+
116
+ # compute ratios and "merge" nodes
117
+ # merging: delete some children nodes, add some parent nodes
118
+ node_ids_to_delete = set()
119
+ nodes_to_add: Dict[str, BaseNode] = {}
120
+ for parent_node_id, parent_node in parent_nodes.items():
121
+ parent_child_nodes = parent_node.child_nodes
122
+ parent_num_children = len(parent_child_nodes) if parent_child_nodes else 1
123
+ parent_cur_children = parent_cur_children_dict[parent_node_id]
124
+ ratio = len(parent_cur_children) / parent_num_children
125
+
126
+ # if ratio is high enough, merge up to the next level in the hierarchy
127
+ if ratio > self._merge_up_thresh:
128
+ node_ids_to_delete.update(
129
+ set({n.node.node_id for n in parent_cur_children})
130
+ )
131
+
132
+ parent_node_text = truncate_text(getattr(parent_node, 'text', ''), 100)
133
+ info_str = (
134
+ f"> Merging {len(parent_cur_children)} nodes into parent node.\n"
135
+ f"> Parent node id: {parent_node_id}.\n"
136
+ f"> Parent node text: {parent_node_text}\n"
137
+ )
138
+ # logger.info(info_str)
139
+ if self._verbose:
140
+ print(info_str)
141
+
142
+ # add parent node
143
+ # can try averaging score across embeddings for now
144
+ avg_score = sum(
145
+ [n.get_score() or 0.0 for n in parent_cur_children]
146
+ ) / len(parent_cur_children)
147
+ parent_node_with_score = NodeWithScore(
148
+ node=parent_node, score=avg_score
149
+ )
150
+ nodes_to_add[parent_node_id] = parent_node_with_score # type: ignore (dict is annotated for BaseNode, but we intentionally store NodeWithScore here)
151
+
152
+ # delete old child nodes, add new parent nodes
153
+ new_nodes = [n for n in nodes if n.node.node_id not in node_ids_to_delete]
154
+ # add parent nodes
155
+ new_nodes.extend(list(nodes_to_add.values())) # type: ignore (the dict holds NodeWithScore values, matching new_nodes)
156
+
157
+ is_changed = len(node_ids_to_delete) > 0
158
+ return new_nodes, is_changed
159
+
160
+
161
+ def _fill_in_nodes(
162
+ self, nodes: List[NodeWithScore]
163
+ ) -> Tuple[List[NodeWithScore], bool]:
164
+ """Fill in nodes."""
165
+ new_nodes = []
166
+ is_changed = False
167
+ for idx, node in enumerate(nodes):
168
+ new_nodes.append(node)
169
+ if idx >= len(nodes) - 1:
170
+ continue
171
+
172
+ cur_node = cast(BaseNode, node.node)
173
+ # if there's a node in the middle, add that to the queue
174
+ if (
175
+ cur_node.next_node is not None
176
+ and cur_node.next_node == nodes[idx + 1].node.prev_node
177
+ ):
178
+ is_changed = True
179
+ next_node = self._vector_store_index.storage_context.docstore.get_document(
180
+ cur_node.next_node.node_id
181
+ )
182
+ next_node = cast(BaseNode, next_node)
183
+
184
+ next_node_text = truncate_text(getattr(next_node, 'text', ''), 100) # TODO: why not higher?
185
+ info_str = (
186
+ f"> Filling in node. Node id: {cur_node.next_node.node_id}"
187
+ f"> Node text: {next_node_text}\n"
188
+ )
189
+ # logger.info(info_str)
190
+ if self._verbose:
191
+ print(info_str)
192
+
193
+ # set score to be average of current node and next node
194
+ avg_score = (node.get_score() + nodes[idx + 1].get_score()) / 2
195
+ new_nodes.append(NodeWithScore(node=next_node, score=avg_score))
196
+ return new_nodes, is_changed
197
+
198
+
199
+ def _try_merging(
200
+ self, nodes: List[NodeWithScore]
201
+ ) -> Tuple[List[NodeWithScore], bool]:
202
+ """Try different ways to merge nodes."""
203
+ # first try filling in nodes
204
+ nodes, is_changed_0 = self._fill_in_nodes(nodes)
205
+ # then try merging nodes
206
+ nodes, is_changed_1 = self._get_parents_and_merge(nodes)
207
+ return nodes, is_changed_0 or is_changed_1
208
+
209
+
210
+ def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
211
+ """Retrieve."""
212
+ # Get vector stores retrieved nodes
213
+ vector_sentence_nodes = self.sentence_vector_retriever.retrieve(query_bundle)# , **kwargs)
214
+ bm25_sentence_nodes = self.sentence_bm25_retriever.retrieve(query_bundle)# , **kwargs)
215
+
216
+ # Get initial nodes from hybrid search.
217
+ initial_nodes = _merge_on_scores(
218
+ vector_sentence_nodes,
219
+ bm25_sentence_nodes,
220
+ [getattr(a, "score", np.nan) for a in vector_sentence_nodes],
221
+ [getattr(b, "score", np.nan) for b in bm25_sentence_nodes],
222
+ a_weight=self._semantic_weight_fraction,
223
+ top_k=self._fusion_similarity_top_k
224
+ )
225
+
226
+ # Merge nodes
227
+ cur_nodes, is_changed = self._try_merging(list(initial_nodes)) # technically _merge_on_scores returns a sequence.
228
+ while is_changed:
229
+ cur_nodes, is_changed = self._try_merging(cur_nodes)
230
+
231
+ # sort by similarity
232
+ cur_nodes.sort(key=lambda x: x.get_score(), reverse=True)
233
+
234
+ # some other reranking and filtering node postprocessors here?
235
+ # https://docs.llamaindex.ai/en/stable/module_guides/querying/node_postprocessors/root.html
236
+ return cur_nodes
237
+
238
+ @st.cache_resource
239
+ def get_retriever(
240
+ _vector_store_index: VectorStoreIndex,
241
+
242
+ semantic_top_k: int = 10,
243
+ sparse_top_k: int = 6,
244
+
245
+ fusion_similarity_top_k: int = 10, # total number of snippets to retrieve after the Reciprocal Rerank.
246
+ semantic_weight_fraction: float = 0.6, # percentage weight to give to semantic (vector) scores vs sparse BM25
247
+ merge_up_thresh: float = 0.5, # fraction of nodes needed to be retrieved to merge up to semantic level
248
+
249
+ verbose: bool = True,
250
+ _callback_manager: Optional[CallbackManager] = None,
251
+ object_map: Optional[dict] = None,
252
+ objects: Optional[List[IndexNode]] = None,
253
+ ) -> BaseRetriever:
254
+ """Get the retriver to use.
255
+
256
+ Args:
257
+ vector_store_index (VectorStoreIndex): The vector store to query on.
258
+ semantic_top_k (int, optional): Top k nodes to retrieve semantically (cosine). Defaults to 10.
259
+ sparse_top_k (int, optional): Top k nodes to retrieve sparsely (BM25). Defaults to 6.
260
+ fusion_similarity_top_k (int, optional): Maximum number of nodes to retrieve after fusing. Defaults to 10.
261
+ callback_manager (Optional[CallbackManager], optional): Callback manager. Defaults to None.
262
+ object_map (Optional[dict], optional): Object map. Defaults to None.
263
+ objects (Optional[List[IndexNode]], optional): Objects list. Defaults to None.
264
+
265
+ Returns:
266
+ BaseRetriever: Retriever to use.
267
+ """
268
+ retriever = RAGRetriever(
269
+ vector_store_index=_vector_store_index,
270
+ semantic_top_k=semantic_top_k,
271
+ sparse_top_k=sparse_top_k,
272
+ fusion_similarity_top_k=fusion_similarity_top_k,
273
+ semantic_weight_fraction=semantic_weight_fraction,
274
+ merge_up_thresh=merge_up_thresh,
275
+ verbose=verbose,
276
+ callback_manager=_callback_manager,
277
+ object_map=object_map,
278
+ objects=objects
279
+ )
280
+ return (retriever)
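A usage sketch for get_retriever; vector_store_index is assumed to be a populated VectorStoreIndex built elsewhere in this repo (e.g. via storage.py), and the query string is illustrative only.

from retriever import get_retriever

retriever = get_retriever(
    _vector_store_index=vector_store_index,  # a populated VectorStoreIndex (assumed to exist)
    semantic_top_k=10,
    sparse_top_k=6,
    fusion_similarity_top_k=10,
)
nodes = retriever.retrieve("What are the key findings of the document?")
for node_with_score in nodes:
    print(node_with_score.get_score(), node_with_score.node.node_id)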
storage.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [STORAGE]
3
+ #####################################################
4
+ # Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This project creates an app to chat with PDFs.
8
+
9
+ # This is the setup for the Storage in the RAG pipeline.
10
+ #####################################################
11
+ ## TODOS:
12
+ # Handle creating multiple vector stores, one for each document which has been processed (?)
13
+
14
+ #####################################################
15
+ ## IMPORTS:
16
+ import gc
17
+ from torch.cuda import empty_cache
18
+
19
+ from typing import Optional, IO, List, Tuple
20
+
21
+ import streamlit as st
22
+
23
+ import qdrant_client
24
+ from llama_index.core import StorageContext
25
+ from llama_index.core.storage.docstore.types import BaseDocumentStore
26
+ from llama_index.core.storage.docstore import SimpleDocumentStore
27
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
28
+ from llama_index.core import VectorStoreIndex
29
+
30
+ from llama_index.core.settings import Settings
31
+ from llama_index.core.base.embeddings.base import BaseEmbedding
32
+ from llama_index.core.node_parser import NodeParser
33
+
34
+ # Reader and processing
35
+ from pdf_reader import UnstructuredPDFReader
36
+ from pdf_reader_utils import clean_abbreviations, dedupe_title_chunks, combine_listitem_chunks, remove_header_footer_repeated, chunk_by_header
37
+ from metadata_adder import UnstructuredPDFPostProcessor
38
+
39
+ #####################################################
40
+ # Get Vector Store
41
+ @st.cache_resource
42
+ def get_vector_store() -> QdrantVectorStore:
43
+ qdr_client = qdrant_client.QdrantClient(
44
+ location=":memory:"
45
+ )
46
+ qdr_aclient = qdrant_client.AsyncQdrantClient(
47
+ location=":memory:"
48
+ )
49
+ return QdrantVectorStore(client=qdr_client, aclient=qdr_aclient, collection_name='pdf', prefer_grpc=True)
50
+
51
+
52
+ # Get Document Store from List of Documents
53
+ # @st.cache_resource # can't hash a list.
54
+ def get_docstore(documents: List) -> BaseDocumentStore:
55
+ """Get the document store from a list of documents."""
56
+ docstore = SimpleDocumentStore()
57
+ docstore.add_documents(documents)
58
+ return docstore
59
+
60
+
61
+ # Get storage context and
62
+ # @st.cache_resource # can't cache the pdf_reader or vector_store
63
+ # def pdf_to_storage(
64
+ # pdf_file_path: Optional[str],
65
+ # pdf_file: Optional[IO[bytes]],
66
+ # _pdf_reader: UnstructuredPDFReader,
67
+ # _embed_model: BaseEmbedding,
68
+ # _node_parser: Optional[NodeParser] = None,
69
+ # _pdf_postprocessor: Optional[UnstructuredPDFPostProcessor] = None,
70
+ # _vector_store: Optional[QdrantVectorStore]=None,
71
+ # ) -> Tuple[StorageContext, VectorStoreIndex]:
72
+ # """Read in PDF and save to storage."""
73
+
74
+ # # Read the PDF with the PDF reader
75
+ # pdf_chunks = _pdf_reader.load_data(pdf_file_path=pdf_file_path, pdf_file=pdf_file)
76
+
77
+ # # Clean the PDF chunks
78
+ # # Insert any cleaners here.
79
+
80
+ # # TODO: Cleaners to remove repeated header/footer text, overlapping elements, ...
81
+ # pdf_chunks = clean_abbreviations(pdf_chunks)
82
+ # pdf_chunks = dedupe_title_chunks(pdf_chunks)
83
+ # pdf_chunks = combine_listitem_chunks(pdf_chunks)
84
+ # pdf_chunks = remove_header_footer_repeated(pdf_chunks)
85
+ # empty_cache()
86
+ # gc.collect()
87
+
88
+ # # Postprocess the PDF nodes.
89
+ # if (_node_parser is None):
90
+ # _node_parser = Settings.node_parser
91
+
92
+ # # Combine by semantic headers
93
+ # pdf_chunks = chunk_by_header(pdf_chunks, 1000)
94
+ # pdf_chunks = _node_parser.get_nodes_from_documents(pdf_chunks)
95
+
96
+ # if (_pdf_postprocessor is not None):
97
+ # pdf_chunks = _pdf_postprocessor(pdf_chunks)
98
+
99
+ # # Add embeddings
100
+ # pdf_chunks = _embed_model(pdf_chunks)
101
+
102
+ # # Create Document Store
103
+ # docstore = get_docstore(documents=pdf_chunks)
104
+
105
+ # # Create Vector Store if not provided
106
+ # if (_vector_store is None):
107
+ # _vector_store = get_vector_store()
108
+
109
+ # ## TODO: Handle images in StorageContext.
110
+
111
+ # # Save into Storage
112
+ # storage_context = StorageContext.from_defaults(
113
+ # docstore=docstore,
114
+ # vector_store=_vector_store
115
+ # )
116
+ # vector_store_index = VectorStoreIndex(
117
+ # pdf_chunks, storage_context=storage_context
118
+ # )
119
+
120
+ # return (storage_context, vector_store_index)
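The commented-out pipeline above shows the intended flow; below is a minimal sketch of combining the two active helpers, assuming nodes is a list of already-parsed (and embedded) chunks produced by the reader modules.

from llama_index.core import StorageContext, VectorStoreIndex
from storage import get_vector_store, get_docstore

vector_store = get_vector_store()
docstore = get_docstore(documents=nodes)  # nodes: already-parsed and embedded chunks (assumed)
storage_context = StorageContext.from_defaults(docstore=docstore, vector_store=vector_store)
vector_store_index = VectorStoreIndex(nodes, storage_context=storage_context)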
summary.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #####################################################
2
+ ### DOCUMENT PROCESSOR [Summarizer]
3
+ #####################################################
4
+ ### Jonathan Wang
5
+
6
+ # ABOUT:
7
+ # This creates an app to chat with PDFs.
8
+
9
+ # This is the Summarizer
10
+ # Which creates summaries based on documents.
11
+ #####################################################
12
+ ### TODO Board:
13
+ # Summary Index for document?
14
+
15
+ # https://docs.llamaindex.ai/en/stable/examples/response_synthesizers/tree_summarize/
16
+ # https://sourajit16-02-93.medium.com/text-summarization-unleashed-novice-to-maestro-with-llms-and-instant-code-solutions-8d26747689c4
17
+
18
+ #####################################################
19
+ ### PROGRAM SETTINGS
20
+
21
+
22
+ #####################################################
23
+ ### PROGRAM IMPORTS
24
+ import logging
25
+
26
+ from typing import Optional, Sequence, Any, Callable, cast
27
+ from llama_index.core.bridge.pydantic import Field, PrivateAttr
28
+
29
+ from llama_index.core.settings import Settings
30
+ from llama_index.core.base.llms.base import BaseLLM
31
+ from llama_index.core.multi_modal_llms import MultiModalLLM
32
+ from llama_index.core.schema import BaseNode, TextNode, ImageDocument
33
+ from llama_index.core.callbacks.base import CallbackManager
34
+
35
+ from llama_index.core.response_synthesizers import TreeSummarize
36
+
37
+ # Own Modules
38
+ from metadata_adder import ModelMetadataAdder
39
+
40
+ #####################################################
41
+ ### CONSTANTS
42
+ logger = logging.getLogger(__name__)
43
+
44
+ DEFAULT_SUMMARY_TEMPLATE = """You are an expert summarizer of information. You are given some information from a document. Summarize the information, and then provide the key information that can be drawn from it. The information is below:
45
+ {context_str}
46
+ """
47
+
48
+ DEFAULT_ONELINE_SUMMARY_TEMPLATE = """You are an expert summarizer of information. You are given a summary of a document. In no more than three sentences, describe the subject of the document, the main ideas of the document, and what types of questions can be answered from it."""
49
+
50
+ DEFAULT_TREE_SUMMARY_TEMPLATE = """You are an expert summarizer of information. You are given some text from a document.
51
+ Please provide a comprehensive summary of the text.
52
+ Include the main subject of the text, the key points or topics, and the most important conclusions if there are any.
53
+ The summary should be detailed yet concise."""
54
+
55
+ DEFAULT_TABLE_SUMMARY_TEMPLATE = """You are an expert summarizer of tables. You are given a table or part of a table in HTML format. The table is below:
56
+ {context_str}
57
+ ----------------
58
+ Summarize the table, and then provide the key insights that can be drawn directly from the table. If this is not actually an HTML table or part of an HTML table, please do not respond.
59
+ """
60
+
61
+ DEFAULT_IMAGE_SUMMARY_TEMPLATE = """You are an expert image summarizer. You are given an image. Summarize the image, and then provide the key insights that can be drawn directly from the image, if there are any.
62
+ """
63
+
64
+ #####################################################
65
+ ### SCRIPT
66
+
67
+ class TextSummaryMetadataAdder(ModelMetadataAdder):
68
+ """Adds metadata to nodes based on a language model."""
69
+
70
+ _llm: BaseLLM = PrivateAttr()
71
+
72
+ def __init__(
73
+ self,
74
+ metadata_name: str,
75
+ llm: Optional[BaseLLM] = None,
76
+ prompt_template: Optional[str] = DEFAULT_SUMMARY_TEMPLATE,
77
+ **kwargs: Any
78
+ ) -> None:
79
+ """Init params."""
80
+ llm = llm or Settings.llm
81
+ prompt_template = prompt_template if prompt_template is not None else DEFAULT_SUMMARY_TEMPLATE
82
+ super().__init__(metadata_name=metadata_name, prompt_template=prompt_template, **kwargs)
+ self._llm = llm  # used by get_node_metadata
83
+
84
+ @classmethod
85
+ def class_name(cls) -> str:
86
+ return "TextSummaryMetadataAdder"
87
+
88
+ def get_node_metadata(self, node: BaseNode) -> Optional[str]:
89
+ if (getattr(node, 'text', None) is None):
90
+ return None
91
+
92
+ response = self._llm.complete(prompt=self.prompt_template.format(context_str=node.text))
93
+ return response.text
94
+
95
+
96
+ class TableSummaryMetadataAdder(ModelMetadataAdder):
97
+ """Adds table summary metadata to a document.
98
+
99
+ Args:
100
+ metadata_name: The name of the metadata to add to the document. Defaults to 'table_summary'.
101
+ llm: The LLM to use to generate the table summary. Defaults to Settings llm.
102
+ prompt_template: The prompt template to use to generate the table summary. Defaults to DEFAULT_TABLE_SUMMARY_TEMPLATE.
103
+ """
104
+ _llm: BaseLLM = PrivateAttr()
105
+
106
+ def __init__(
107
+ self,
108
+ metadata_name: str = "table_summary", ## TODO: This is a bad pattern, string should not be hardcoded like this
109
+ llm: Optional[BaseLLM] = None,
110
+ prompt_template: Optional[str] = DEFAULT_TABLE_SUMMARY_TEMPLATE,
111
+ # num_workers: int = 1,
112
+ **kwargs: Any,
113
+ ) -> None:
114
+ """Init params."""
115
+ llm = llm or Settings.llm
116
+ prompt_template = prompt_template or DEFAULT_TABLE_SUMMARY_TEMPLATE
117
+ super().__init__(metadata_name=metadata_name, prompt_template=prompt_template, **kwargs)
118
+ self._llm = llm
119
+
120
+ @classmethod
121
+ def class_name(cls) -> str:
122
+ return "TableSummaryMetadataAdder"
123
+
124
+ def get_node_metadata(self, node: BaseNode) -> Optional[str]:
125
+ """Given a node, get the metadata for the node using the language model."""
126
+ ## NOTE: Our PDF Reader parser distinguishes between TextNode and TableNode using the 'orignal_table_text' attribute.
127
+ ## BUG (future): `orignal_table_text` should not be hardcoded.
128
+ if (not isinstance(node, TextNode)):
129
+ return None
130
+ if (node.metadata.get('orignal_table_text') is None):
131
+ return None
132
+ if (getattr(node, 'text', None) is None):
133
+ return None
134
+
135
+ response = self._llm.complete(
136
+ self.prompt_template.format(context_str=node.text)
137
+ )
138
+ return response.text
139
+
140
+
141
+ class ImageSummaryMetadataAdder(ModelMetadataAdder):
142
+ """Adds image summary metadata to a document.
143
+
144
+ Args:
145
+ metadata_name: The name of the metadata to add to the document. Defaults to 'image_summary'.
146
+ """
147
+ _llm: MultiModalLLM = PrivateAttr()
148
+
149
+ def __init__(
150
+ self,
151
+ llm: MultiModalLLM,
152
+ prompt_template: str = DEFAULT_IMAGE_SUMMARY_TEMPLATE,
153
+ metadata_name: str = 'image_summary',
154
+ **kwargs: Any,
155
+ ) -> None:
156
+ """Init params."""
157
+ super().__init__(metadata_name=metadata_name, prompt_template=prompt_template, **kwargs)
158
+ self._llm = llm
159
+
160
+ @classmethod
161
+ def class_name(cls) -> str:
162
+ return "ImageSummaryMetadataAdder"
163
+
164
+ def _get_image_node_metadata(self, node: BaseNode) -> Optional[str]:
165
+ """Handles getting images from image nodes.
166
+
167
+ Args:
168
+ node (BaseNode): The image node to get the image summary for. NOTE: This can technically be any type of node so long as it has an image stored.
169
+
170
+ Returns:
171
+ Optional[str]: The image summary if it exists. If not, return None.
172
+ """
173
+ if (
174
+ ((getattr(node, 'image', None) is None) and (getattr(node, 'image_path', None) is None))
175
+ or (not callable(getattr(node, "resolve_image", None))) # method used to convert node to PILImage for model.
176
+ ):
177
+ # Not a valid image node with image attributes and image conversion.
178
+ return None
179
+
180
+ # Check whether the image is of text or not
181
+ ### TODO: Replace this with a text-overlap thing.
182
+ image = node.resolve_image() # type: ignore | we check for this above.
183
+ im_width, im_height = image.size
184
+ if (im_width < 70): # TODO: this really should be based on the average text width / whether this is overlapping text.
185
+ return None
186
+
187
+ ## NOTE: We're assuming that the llm complete function has a parameter `images` to send image node(s) as input.
188
+ ## This is NOT necessarily true if the end user decides to create their own implementation of a MultiModalLLM.
189
+ response = self._llm.complete(
190
+ prompt=self.prompt_template,
191
+ image_documents=[
192
+ cast(ImageDocument, node) # NOTE: This is a hack. Technically, node should be an ImageNode, a parent of ImageDocument; but I don't think we'll be using the Document features so this should be okay.
193
+ ],
194
+ )
195
+ return response.text
196
+
197
+ def _get_composite_node_metadata(self, node: BaseNode) -> Optional[str]:
198
+ """Handles getting images from composite nodes (i.e., where an image is stored as a original node inside a composite node).
199
+
200
+ Args:
201
+ node (TextNode): The composite node to get the image summary for.
202
+
203
+ Returns:
204
+ Optional[str]: The image summary if it exists. If not, return None.
205
+ """
206
+ if ('orig_nodes' not in node.metadata):
207
+ return None # no image nodes in the composite node.
208
+
209
+ output = ""
210
+ for orig_node in node.metadata['orig_nodes']:
211
+ output += str(self._get_image_node_metadata(orig_node) or "")
212
+
213
+ if (output == ""):
214
+ return None
215
+ return output
216
+
217
+ def get_node_metadata(self, node: BaseNode) -> Optional[str]:
218
+ """Get the image summary for a node (or subnodes)."""
219
+
220
+ if (node.metadata['type'].startswith('Composite')):
221
+ return self._get_composite_node_metadata(node)
222
+ else:
223
+ return self._get_image_node_metadata(node)
224
+
225
+
226
+ def get_tree_summarizer(
227
+ llm: Optional[BaseLLM] = None,
228
+ callback_manager: Optional[CallbackManager] = None,
229
+ ) -> TreeSummarize:
230
+ llm = llm or Settings.llm
231
+ tree_summarizer = TreeSummarize(llm=llm, callback_manager=callback_manager)
232
+ return (tree_summarizer)
233
+
234
+
235
+ def get_tree_summary(tree_summarizer: TreeSummarize, text_chunks: Sequence[BaseNode]) -> str:
236
+ """Summarize the text nodes using a tree summarizer.
237
+
238
+ Args:
239
+ tree_summarizer (TreeSummarize): The tree summarizer to use.
240
+ text_chunks (Sequence[BaseNode]): The text nodes to summarize.
241
+
242
+ Returns:
243
+ str: The summarized text.
244
+ """
245
+ # Synchronous tree summarization; the async variant (aget_response) would need to be awaited.
+ response = tree_summarizer.get_response(query_str=DEFAULT_TREE_SUMMARY_TEMPLATE, text_chunks=[getattr(chunk, 'text') for chunk in text_chunks if hasattr(chunk, 'text')])
246
+ return str(response)
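A brief usage sketch of the summarizer helpers, assuming pdf_chunks is the node list produced by the PDF reader and that Settings.llm has already been configured.

from summary import get_tree_summarizer, get_tree_summary

tree_summarizer = get_tree_summarizer()  # falls back to Settings.llm when no LLM is passed
document_summary = get_tree_summary(tree_summarizer, text_chunks=pdf_chunks)  # pdf_chunks assumed from the reader
print(document_summary)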