Spaces:
Runtime error
Runtime error
elia-waefler
commited on
Commit
•
c2b923e
1
Parent(s):
b863ba1
Upload 17 files
Browse files- .gitattributes +37 -35
- .gitignore +157 -0
- LICENSE +201 -0
- README.md +10 -13
- _IFC-checker.py +2 -0
- _ecodomus.py +40 -0
- _local_embeddings.py +2 -0
- app.py +309 -0
- ingest.py +126 -0
- kbob_file_handler.py +106 -0
- my_1_reader.py +201 -0
- my_1_writer.py +98 -0
- my_2_sim_search.py +163 -0
- my_new_openai.py +153 -0
- my_vectors.py +17 -0
- requirements.txt +15 -0
- setup_db.py +50 -0
.gitattributes
CHANGED
@@ -1,35 +1,37 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
U3_alle/faiss_index.index/index.faiss filter=lfs diff=lfs merge=lfs -text
|
37 |
+
KBOB_Klassifizierung.xlsx filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib64/
|
18 |
+
parts/
|
19 |
+
sdist/
|
20 |
+
var/
|
21 |
+
wheels/
|
22 |
+
share/python-wheels/
|
23 |
+
*.egg-info/
|
24 |
+
.installed.cfg
|
25 |
+
*.egg
|
26 |
+
MANIFEST
|
27 |
+
|
28 |
+
# PyInstaller
|
29 |
+
# Usually these files are written by a python script from a template
|
30 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
31 |
+
*.manifest
|
32 |
+
*.spec
|
33 |
+
|
34 |
+
# Installer logs
|
35 |
+
pip-log.txt
|
36 |
+
pip-delete-this-directory.txt
|
37 |
+
|
38 |
+
# Unit test / coverage reports
|
39 |
+
htmlcov/
|
40 |
+
.tox/
|
41 |
+
.nox/
|
42 |
+
.coverage
|
43 |
+
.coverage.*
|
44 |
+
.cache
|
45 |
+
nosetests.xml
|
46 |
+
coverage.xml
|
47 |
+
*.cover
|
48 |
+
*.py,cover
|
49 |
+
.hypothesis/
|
50 |
+
.pytest_cache/
|
51 |
+
cover/
|
52 |
+
|
53 |
+
# Translations
|
54 |
+
*.mo
|
55 |
+
*.pot
|
56 |
+
|
57 |
+
# Django stuff:
|
58 |
+
*.log
|
59 |
+
local_settings.py
|
60 |
+
db.sqlite3
|
61 |
+
db.sqlite3-journal
|
62 |
+
|
63 |
+
# Flask stuff:
|
64 |
+
instance/
|
65 |
+
.webassets-cache
|
66 |
+
|
67 |
+
# Scrapy stuff:
|
68 |
+
.scrapy
|
69 |
+
|
70 |
+
# Sphinx documentation
|
71 |
+
docs/_build/
|
72 |
+
|
73 |
+
# PyBuilder
|
74 |
+
.pybuilder/
|
75 |
+
target/
|
76 |
+
|
77 |
+
# Jupyter Notebook
|
78 |
+
.ipynb_checkpoints
|
79 |
+
|
80 |
+
# IPython
|
81 |
+
profile_default/
|
82 |
+
ipython_config.py
|
83 |
+
|
84 |
+
# pyenv
|
85 |
+
# For a library or package, you might want to ignore these files since the code is
|
86 |
+
# intended to run in multiple environments; otherwise, check them in:
|
87 |
+
# .python-version
|
88 |
+
|
89 |
+
# pipenv
|
90 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
91 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
92 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
93 |
+
# install all needed dependencies.
|
94 |
+
#Pipfile.lock
|
95 |
+
|
96 |
+
# poetry
|
97 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
98 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
99 |
+
# commonly ignored for libraries.
|
100 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
101 |
+
#poetry.lock
|
102 |
+
|
103 |
+
# pdm
|
104 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
105 |
+
#pdm.lock
|
106 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
107 |
+
# in version control.
|
108 |
+
# https://pdm.fming.dev/#use-with-ide
|
109 |
+
.pdm.toml
|
110 |
+
|
111 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
112 |
+
__pypackages__/
|
113 |
+
|
114 |
+
# Celery stuff
|
115 |
+
celerybeat-schedule
|
116 |
+
celerybeat.pid
|
117 |
+
|
118 |
+
# SageMath parsed files
|
119 |
+
*.sage.py
|
120 |
+
|
121 |
+
# Environments
|
122 |
+
.env
|
123 |
+
env/
|
124 |
+
ENV/
|
125 |
+
env.bak/
|
126 |
+
venv.bak/
|
127 |
+
|
128 |
+
# Spyder project settings
|
129 |
+
.spyderproject
|
130 |
+
.spyproject
|
131 |
+
|
132 |
+
# Rope project settings
|
133 |
+
.ropeproject
|
134 |
+
|
135 |
+
# mkdocs documentation
|
136 |
+
/site
|
137 |
+
|
138 |
+
# mypy
|
139 |
+
.mypy_cache/
|
140 |
+
.dmypy.json
|
141 |
+
dmypy.json
|
142 |
+
|
143 |
+
# Pyre type checker
|
144 |
+
.pyre/
|
145 |
+
|
146 |
+
# pytype static type analyzer
|
147 |
+
.pytype/
|
148 |
+
|
149 |
+
# Cython debug symbols
|
150 |
+
cython_debug/
|
151 |
+
|
152 |
+
# PyCharm
|
153 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
154 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
155 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
156 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
157 |
+
#.idea/
|
LICENSE
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright [yyyy] [name of copyright owner]
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
README.md
CHANGED
@@ -1,13 +1,10 @@
|
|
1 |
-
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
-
sdk: streamlit
|
7 |
-
sdk_version: 1.
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
---
|
2 |
+
title: ki_inselspital
|
3 |
+
emoji: 🏥
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: indigo
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.33.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
|
|
|
|
|
_IFC-checker.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
# nicht aktuell
|
2 |
+
# my utils
|
_ecodomus.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import urllib.parse
|
3 |
+
import os
|
4 |
+
|
5 |
+
# Assuming environment variables are used to store sensitive data
|
6 |
+
client_id = 'Siemens.Advanta'
|
7 |
+
client_secret = os.environ.get('SIEMENS_API_KEY')
|
8 |
+
username = "I0340828"
|
9 |
+
password = os.environ["SIEMENS_EW_PW"]
|
10 |
+
|
11 |
+
# Endpoint
|
12 |
+
url = "https://eu-ecodomus-services.siemens.com/api/token HTTP/1.1"
|
13 |
+
|
14 |
+
# Data needs to be URL-encoded
|
15 |
+
data = {
|
16 |
+
'client_id': client_id,
|
17 |
+
'client_secret': client_secret,
|
18 |
+
'username': username,
|
19 |
+
'password': password,
|
20 |
+
'grant_type': 'password'
|
21 |
+
}
|
22 |
+
encoded_data = urllib.parse.urlencode(data)
|
23 |
+
print(encoded_data)
|
24 |
+
# Headers
|
25 |
+
headers = {
|
26 |
+
'Content-Type': 'application/x-www-form-urlencoded'
|
27 |
+
}
|
28 |
+
|
29 |
+
# POST Request
|
30 |
+
response = requests.post(url, data=encoded_data, headers=headers)
|
31 |
+
|
32 |
+
print("Status Code:", response.status_code)
|
33 |
+
print("Response Content:", response.text)
|
34 |
+
print("content", response.content)
|
35 |
+
|
36 |
+
if response.status_code == 200:
|
37 |
+
access_token = response.json().get('access_token')
|
38 |
+
print("Access Token:", access_token)
|
39 |
+
else:
|
40 |
+
print("Failed to fetch access token. Check response content for details.")
|
_local_embeddings.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
#nicht aktuell
|
2 |
+
# my Utils
|
app.py
ADDED
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
testing my own vectors
|
3 |
+
|
4 |
+
list comprehension whenever possible
|
5 |
+
main function
|
6 |
+
if name == main
|
7 |
+
reusable functions that do just one specific task
|
8 |
+
type checking
|
9 |
+
def my_function(in_one: str, in_two: int) -> None:
|
10 |
+
pip install mypy for static typechecking.
|
11 |
+
|
12 |
+
O Gebäudebetrieb
|
13 |
+
Reinigung
|
14 |
+
|
15 |
+
|
16 |
+
FM Prozesse nicht für klassifizierung
|
17 |
+
Phase auch nicht. IMMER 53!!
|
18 |
+
|
19 |
+
VISION: AUTOMATISCHE BENENNUNG BEI ECODOMUS UPLOAD
|
20 |
+
Automatische metadatenzuodrdnung
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
"""
|
26 |
+
import json
|
27 |
+
|
28 |
+
import ingest
|
29 |
+
import my_1_writer
|
30 |
+
import my_2_sim_search
|
31 |
+
import my_vectors
|
32 |
+
import setup_db
|
33 |
+
import my_new_openai
|
34 |
+
import time
|
35 |
+
import streamlit as st
|
36 |
+
import os
|
37 |
+
from PIL import Image
|
38 |
+
import json
|
39 |
+
from typing import Any, Dict
|
40 |
+
|
41 |
+
|
42 |
+
def read_json_file(file_path: str) -> Dict[str, Any]:
|
43 |
+
"""
|
44 |
+
Diese Funktion liest den Inhalt einer JSON-Datei und gibt ihn als Wörterbuch zurück.
|
45 |
+
|
46 |
+
Argumente:
|
47 |
+
file_path (str): Der Dateipfad zur JSON-Datei.
|
48 |
+
|
49 |
+
Rückgabewert:
|
50 |
+
Dict[str, Any]: Der Inhalt der JSON-Datei als DICT
|
51 |
+
ANY ist oft ein VECTOR = list[float]
|
52 |
+
"""
|
53 |
+
try:
|
54 |
+
with open(file_path, 'r', encoding='utf-8') as file:
|
55 |
+
content = json.load(file)
|
56 |
+
return content
|
57 |
+
except Exception as e:
|
58 |
+
return {"error": str(e)}
|
59 |
+
|
60 |
+
|
61 |
+
#test this:
|
62 |
+
def extract_tables_from_page_advanced(page):
|
63 |
+
"""Extrahiert einfache Tabellenstrukturen basierend auf Textblöcken einer Seite."""
|
64 |
+
text_blocks = page.get_text("blocks")
|
65 |
+
text_blocks = sorted(text_blocks, key=lambda block: (block[1], block[0])) # Nach Y, dann X sortieren
|
66 |
+
|
67 |
+
# Erstellen eines Histogramms der X-Startpunkte, um Spalten zu identifizieren
|
68 |
+
column_threshold = 10 # Mindestabstand zwischen verschiedenen Spalten
|
69 |
+
columns = {}
|
70 |
+
for block in text_blocks:
|
71 |
+
x_start = block[0]
|
72 |
+
found_column = False
|
73 |
+
for col in columns.keys():
|
74 |
+
if abs(x_start - col) < column_threshold:
|
75 |
+
columns[col].append(block)
|
76 |
+
found_column = True
|
77 |
+
break
|
78 |
+
if not found_column:
|
79 |
+
columns[x_start] = [block]
|
80 |
+
|
81 |
+
# Tabellenzeilen basierend auf den identifizierten Spalten extrahieren
|
82 |
+
tables = []
|
83 |
+
for col, blocks in columns.items():
|
84 |
+
table = []
|
85 |
+
for block in sorted(blocks, key=lambda block: block[1]): # Nach Y sortieren
|
86 |
+
table.append(block[4].strip()) # Text des Blocks hinzufügen
|
87 |
+
tables.append(table)
|
88 |
+
|
89 |
+
return tables
|
90 |
+
|
91 |
+
|
92 |
+
|
93 |
+
def merge_indices(index1, index2):
|
94 |
+
"""
|
95 |
+
Merge two indices into a new index, assuming both are of the same type and dimensionality.
|
96 |
+
"""
|
97 |
+
pass
|
98 |
+
|
99 |
+
|
100 |
+
def handle_userinput(user_question):
|
101 |
+
pass
|
102 |
+
|
103 |
+
|
104 |
+
def save_uploaded_file(uploaded_file):
|
105 |
+
try:
|
106 |
+
# Create a static folder if it doesn't exist
|
107 |
+
if not os.path.exists('static'):
|
108 |
+
os.makedirs('static')
|
109 |
+
|
110 |
+
# Write the uploaded file to a new file in the static directory
|
111 |
+
with open(os.path.join('static', uploaded_file.name), "wb") as f:
|
112 |
+
f.write(uploaded_file.getbuffer())
|
113 |
+
return True
|
114 |
+
except Exception as e:
|
115 |
+
print(e)
|
116 |
+
return False
|
117 |
+
|
118 |
+
|
119 |
+
def main():
|
120 |
+
st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
|
121 |
+
if True:
|
122 |
+
if "conversation" not in sst:
|
123 |
+
sst.conversation = None
|
124 |
+
if "chat_history" not in sst:
|
125 |
+
sst.chat_history = None
|
126 |
+
if "page" not in sst:
|
127 |
+
sst.page = "home"
|
128 |
+
if "openai" not in sst:
|
129 |
+
sst.openai = True
|
130 |
+
if "login" not in sst:
|
131 |
+
sst.login = False
|
132 |
+
if 'submitted_user_query' not in sst:
|
133 |
+
sst.submitted_user_query = ''
|
134 |
+
if 'submitted_user_safe' not in sst:
|
135 |
+
sst.submitted_user_safe = ''
|
136 |
+
if 'submitted_user_load' not in sst:
|
137 |
+
sst.submitted_user_load = ''
|
138 |
+
if 'widget_user_load' not in sst:
|
139 |
+
sst.widget_user_load = 'U3_alle' # Init the vectorstore
|
140 |
+
if 'vectorstore' not in sst:
|
141 |
+
sst.vectorstore = None
|
142 |
+
|
143 |
+
def submit_user_query():
|
144 |
+
sst.submitted_user_query = sst.widget_user_query
|
145 |
+
sst.widget_user_query = ''
|
146 |
+
|
147 |
+
def submit_user_safe():
|
148 |
+
sst.submitted_user_safe = sst.widget_user_safe
|
149 |
+
sst.widget_user_safe = ''
|
150 |
+
if sst.vectorstore is not None:
|
151 |
+
my_vectors.save_local(sst.vectorstore, path=sst.submitted_user_safe)
|
152 |
+
st.sidebar.success("saved")
|
153 |
+
else:
|
154 |
+
st.sidebar.warning("No embeddings to save. Please process documents first.")
|
155 |
+
|
156 |
+
def submit_user_load():
|
157 |
+
sst.submitted_user_load = sst.widget_user_load
|
158 |
+
sst.widget_user_load = ''
|
159 |
+
if os.path.exists(sst.submitted_user_load):
|
160 |
+
new_db = my_vectors.load_local(f"{sst.submitted_user_load}/faiss_index.index")
|
161 |
+
if sst.vectorstore is not None:
|
162 |
+
if new_db is not None: # Check if this is working
|
163 |
+
st.sidebar.success("Vectors loaded")
|
164 |
+
else:
|
165 |
+
if new_db is not None: # Check if this is working
|
166 |
+
sst.vectorstore = new_db
|
167 |
+
st.sidebar.success("Vectors loaded")
|
168 |
+
else:
|
169 |
+
st.sidebar.warning("Couldn't load/find embeddings")
|
170 |
+
|
171 |
+
st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
|
172 |
+
if st.toggle("show README"):
|
173 |
+
|
174 |
+
st.subheader("Funktion: ")
|
175 |
+
st.write("dieses proof-of-concept von Elia Wäfler demonstriert das Potential von RAG (Retrival Augmented Generation) für BIM2FM Dokumentenablagen am Beispiel Dokumente U3 ASH (Anna Seiler Haus, Inselspital Bern). chatte mit den Dokumenten, oder lade selber ein oder mehrere PDF-Dokumente hoch, um RAG auszuprobieren. die vektoren werden lokal oder im st.session_state gespeichert. Feedback und Bugs gerne an elia.waefler@insel.ch")
|
176 |
+
st.write("Vielen Dank.")
|
177 |
+
st.write("")
|
178 |
+
|
179 |
+
st.subheader("Licence and credits")
|
180 |
+
st.write("THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.")
|
181 |
+
st.write("special thanks to OpenAI, STREAMLIT, HUGGINGFACE, LANGCHAIN and alejandro-ao")
|
182 |
+
l, r = st.columns(2)
|
183 |
+
with l:
|
184 |
+
st.subheader("Limitationen: ")
|
185 |
+
st.write("bisher nur Text aus PDFs")
|
186 |
+
st.write("macht Fehler, kann falsche Informationen geben")
|
187 |
+
st.write("prompts werden bisher nicht geprüft")
|
188 |
+
st.write("")
|
189 |
+
with r:
|
190 |
+
st.subheader("geplante Erweiterungen:")
|
191 |
+
st.write("Tabellen, Bilder werden auch vektorisiert, um die retrival qualität zu verbessern")
|
192 |
+
st.write("on premise anwendung mit mistral 7b oder vergleichbar")
|
193 |
+
st.write("Ecodomus API einbinden, um alle Dokumente einzubinden.")
|
194 |
+
st.write("")
|
195 |
+
|
196 |
+
if sst.login:
|
197 |
+
if st.toggle("RAG / classifier"):
|
198 |
+
#user_question = st.text_input("Ask a question about your documents:", key="user_query", on_change=handle_query)
|
199 |
+
st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
|
200 |
+
#sst.openai = st.toggle(label="use openai?")
|
201 |
+
if sst.submitted_user_query:
|
202 |
+
if sst.vectorstore is not None:
|
203 |
+
handle_userinput(sst.submitted_user_query)
|
204 |
+
sst.submitted_user_query = False
|
205 |
+
else:
|
206 |
+
st.warning("no vectorstore loaded.")
|
207 |
+
|
208 |
+
with st.sidebar:
|
209 |
+
st.subheader("Your documents")
|
210 |
+
pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
|
211 |
+
if st.button("Process"):
|
212 |
+
with st.spinner("Processing"):
|
213 |
+
vec = ingest.get_text_chunks(ingest.get_pdf_text(pdf_docs))
|
214 |
+
st.warning("only text")
|
215 |
+
sst.vectorstore = vec
|
216 |
+
sst.conversation = vec
|
217 |
+
st.success("embedding complete")
|
218 |
+
st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
|
219 |
+
on_change=submit_user_safe)
|
220 |
+
st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
|
221 |
+
on_change=submit_user_load)
|
222 |
+
if st.toggle("reset vectorstore?"):
|
223 |
+
if st.button("Yes, reset"):
|
224 |
+
sst.vectorstore = None
|
225 |
+
st.warning("vectorstore reset complete")
|
226 |
+
else:
|
227 |
+
st.warning("unsaved embeddings will be lost.")
|
228 |
+
else:
|
229 |
+
#vec_store = setup_db.load_vectorstore_from_excel("data/KBOB_Klassifizierung.xlsx")
|
230 |
+
#my_1_writer.safe_my_dict_as_json("data/KBOB_klassen_codes.json", vec_store)
|
231 |
+
vec_store = read_json_file("data/KBOB_klassen_codes.json")
|
232 |
+
|
233 |
+
sst.page = "home"
|
234 |
+
file = st.file_uploader("upload file", accept_multiple_files=False)
|
235 |
+
if st.button("classify me!"):
|
236 |
+
with st.spinner("Classifying..."):
|
237 |
+
query_vecs = []
|
238 |
+
if file.type == "application/pdf":
|
239 |
+
one, two, three, four, five = st.columns(5)
|
240 |
+
text = ingest.get_pdf_text(file)
|
241 |
+
with one:
|
242 |
+
st.success("text")
|
243 |
+
# ONE FILE ONLY OR MULTIPLE AT THE SAME TIME?
|
244 |
+
images = ingest.get_pdf_images(file.getvalue())
|
245 |
+
if type(images) != list:
|
246 |
+
images = [images]
|
247 |
+
for img in images:
|
248 |
+
text += my_new_openai.img_to_text(img_base64=my_new_openai.image_bytes_to_base64(img))
|
249 |
+
with two:
|
250 |
+
st.success("images")
|
251 |
+
|
252 |
+
tabs = ingest.get_pdf_tables(file.getvalue())
|
253 |
+
|
254 |
+
if type(tabs) != list:
|
255 |
+
tabs = [tabs]
|
256 |
+
for tab in tabs:
|
257 |
+
text += my_new_openai.table_to_text(table=tab)
|
258 |
+
with three:
|
259 |
+
st.success("tabs")
|
260 |
+
|
261 |
+
# ONE VECTOR PER PDF OR MULTIPLE (CHUNKS IMGS ...) IS THE QUESTION
|
262 |
+
full_search = my_new_openai.vectorize_data(text)
|
263 |
+
detail_search = [my_new_openai.vectorize_data(_) for _ in ingest.get_text_chunks(text)]
|
264 |
+
with four:
|
265 |
+
st.success("embedded document")
|
266 |
+
st.write(len(list(vec_store.keys())))
|
267 |
+
with one:
|
268 |
+
sorted_vec_table = my_2_sim_search.sim_search_fly(
|
269 |
+
vec_table=vec_store, term=full_search)
|
270 |
+
st.write(f"len of list of categories {len(list(sorted_vec_table.keys()))}")
|
271 |
+
st.write(f"the most fitting category is {next(iter(sorted_vec_table))}")
|
272 |
+
with two:
|
273 |
+
sorted_vecs_two = my_2_sim_search.sim_search_fly(
|
274 |
+
vec_table=read_json_file("vecs/Fachbereiche_vecs.json"), term=full_search)
|
275 |
+
st.write(f"len of list of categories {len(list(sorted_vecs_two.keys()))}")
|
276 |
+
st.write(f"the most fitting Fachbereich is {next(iter(sorted_vecs_two))}")
|
277 |
+
with three:
|
278 |
+
sorted_vecs_three = my_2_sim_search.sim_search_fly(
|
279 |
+
vec_table=read_json_file("vecs/SIA-PHASEN 1-5 OUTPUT_vecs.json"), term=full_search)
|
280 |
+
st.write(f"len of list of categories {len(list(sorted_vecs_three.keys()))}")
|
281 |
+
st.write(f"the most fitting SIA-Phase is {next(iter(sorted_vecs_three))}")
|
282 |
+
for vec in detail_search:
|
283 |
+
pass
|
284 |
+
with four:
|
285 |
+
st.success("classification complete")
|
286 |
+
else:
|
287 |
+
st.error()
|
288 |
+
else:
|
289 |
+
user_pw = st.text_input("ASK_ASH_PASSWORD: ", type="password")
|
290 |
+
if st.button("check"):
|
291 |
+
time.sleep(0.5)
|
292 |
+
if user_pw == ASK_ASH_PASSWORD:
|
293 |
+
sst.login = True
|
294 |
+
if "first_load" not in sst:
|
295 |
+
submit_user_load()
|
296 |
+
sst.first_load = True
|
297 |
+
st.rerun()
|
298 |
+
|
299 |
+
|
300 |
+
|
301 |
+
if __name__ == '__main__':
|
302 |
+
if True:
|
303 |
+
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
|
304 |
+
OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
|
305 |
+
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
|
306 |
+
sst = st.session_state
|
307 |
+
ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
|
308 |
+
main()
|
309 |
+
|
ingest.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from PyPDF2 import PdfReader
|
2 |
+
from langchain.text_splitter import CharacterTextSplitter
|
3 |
+
import tabula
|
4 |
+
import io
|
5 |
+
import fitz # PyMuPDF
|
6 |
+
import pdfplumber
|
7 |
+
from PIL import Image
|
8 |
+
import io
|
9 |
+
|
10 |
+
|
11 |
+
def get_pdf_tables(pdf_bytes):
|
12 |
+
"""
|
13 |
+
Extracts tables from a PDF file loaded directly from bytes.
|
14 |
+
|
15 |
+
Args:
|
16 |
+
pdf_bytes (bytes): The byte content of the PDF file.
|
17 |
+
|
18 |
+
Returns:
|
19 |
+
List[pandas.DataFrame]: A list of DataFrames, each representing a table extracted from the PDF.
|
20 |
+
"""
|
21 |
+
tables = []
|
22 |
+
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
|
23 |
+
for page in pdf.pages:
|
24 |
+
# Extract tables from the current page
|
25 |
+
page_tables = page.extract_tables()
|
26 |
+
for table in page_tables:
|
27 |
+
# Convert table to a DataFrame and append to the list
|
28 |
+
tables.append(table)
|
29 |
+
|
30 |
+
# Optionally convert lists of lists (tables) to pandas DataFrames
|
31 |
+
import pandas as pd
|
32 |
+
dataframes = [pd.DataFrame(table[1:], columns=table[0]) for table in tables if table]
|
33 |
+
return dataframes
|
34 |
+
|
35 |
+
|
36 |
+
def get_pdf_images(pdf_bytes):
|
37 |
+
"""
|
38 |
+
Extracts images and captures screenshots of each page from a given PDF's bytes.
|
39 |
+
|
40 |
+
Args:
|
41 |
+
pdf_bytes (bytes): The byte content of the PDF file.
|
42 |
+
|
43 |
+
Returns:
|
44 |
+
List[bytes]: A list of image bytes extracted from the PDF, including screenshots of each page.
|
45 |
+
"""
|
46 |
+
images = []
|
47 |
+
pdf_stream = io.BytesIO(pdf_bytes)
|
48 |
+
doc = fitz.open("pdf", pdf_stream.read())
|
49 |
+
|
50 |
+
for page_num, page in enumerate(doc):
|
51 |
+
# Take a screenshot of the current page
|
52 |
+
pix = page.get_pixmap() # This line captures the page as an image
|
53 |
+
img_bytes = pix.tobytes("png") # Save the pixmap as PNG bytes
|
54 |
+
images.append(img_bytes) # Append the screenshot to the list of images
|
55 |
+
|
56 |
+
# Extract embedded images
|
57 |
+
for img_index, img in enumerate(page.get_images(full=True)):
|
58 |
+
xref = img[0]
|
59 |
+
base_image = doc.extract_image(xref)
|
60 |
+
image_bytes = base_image["image"]
|
61 |
+
images.append(image_bytes)
|
62 |
+
|
63 |
+
doc.close()
|
64 |
+
return images
|
65 |
+
|
66 |
+
|
67 |
+
def get_pdf_old_tables(pdf_bytes):
|
68 |
+
"""
|
69 |
+
Extracts tables from a given PDF's bytes using Tabula.
|
70 |
+
Args:
|
71 |
+
pdf_bytes (bytes): The byte content of the PDF file.
|
72 |
+
|
73 |
+
Returns:
|
74 |
+
List[pandas.DataFrame]: A list of DataFrames, each representing a table extracted from the PDF.
|
75 |
+
"""
|
76 |
+
pdf_stream = io.BytesIO(pdf_bytes)
|
77 |
+
# Read PDF into list of DataFrame
|
78 |
+
tables = tabula.read_pdf(pdf_stream, pages='all', multiple_tables=True)
|
79 |
+
return tables
|
80 |
+
|
81 |
+
|
82 |
+
def get_pdf_text(pdf_docs):
|
83 |
+
text = ""
|
84 |
+
if type(pdf_docs) == list:
|
85 |
+
for pdf in pdf_docs:
|
86 |
+
pdf_reader = PdfReader(pdf)
|
87 |
+
for page in pdf_reader.pages:
|
88 |
+
text += page.extract_text()
|
89 |
+
else:
|
90 |
+
pdf_reader = PdfReader(pdf_docs)
|
91 |
+
for page in pdf_reader.pages:
|
92 |
+
text += page.extract_text()
|
93 |
+
return text
|
94 |
+
|
95 |
+
|
96 |
+
def get_text_chunks(text):
|
97 |
+
text_splitter = CharacterTextSplitter(
|
98 |
+
separator="\n",
|
99 |
+
chunk_size=1000,
|
100 |
+
chunk_overlap=200,
|
101 |
+
length_function=len
|
102 |
+
)
|
103 |
+
chunks = text_splitter.split_text(text)
|
104 |
+
return chunks
|
105 |
+
|
106 |
+
|
107 |
+
def extract_images_from_pdf_path(pdf_path):
|
108 |
+
doc = fitz.open(pdf_path)
|
109 |
+
images = []
|
110 |
+
for i in range(len(doc)):
|
111 |
+
for img in doc.get_page_images(i):
|
112 |
+
xref = img[0]
|
113 |
+
base = img[1]
|
114 |
+
img_data = doc.extract_image(xref)
|
115 |
+
img_bytes = img_data['image']
|
116 |
+
|
117 |
+
image = Image.open(io.BytesIO(img_bytes))
|
118 |
+
images.append(image)
|
119 |
+
|
120 |
+
return images
|
121 |
+
|
122 |
+
|
123 |
+
def get_tables_from_pdf_path(pdf_path):
|
124 |
+
# read_pdf will save the pdf table into Pandas Dataframe
|
125 |
+
tables = tabula.read_pdf(pdf_path, pages='all')
|
126 |
+
return tables
|
kbob_file_handler.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from typing import Any
|
3 |
+
from typing import List
|
4 |
+
import my_new_openai
|
5 |
+
import my_1_writer
|
6 |
+
|
7 |
+
|
8 |
+
def get_row_names(file_path: str, sheet_name="Block5", header=4, index_col=7) -> List[str]:
|
9 |
+
"""
|
10 |
+
Diese Funktion liest alle Zeilennamen aus der angegebenen Excel-Datei und gibt sie als Liste von Zeichenketten zurück.
|
11 |
+
|
12 |
+
Argumente:
|
13 |
+
file_path (str): Der Dateipfad zur Excel-Datei.
|
14 |
+
|
15 |
+
Rückgabewert:
|
16 |
+
List[str]: Eine Liste von Zeilennamen als Zeichenketten.
|
17 |
+
"""
|
18 |
+
try:
|
19 |
+
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header, index_col=index_col)
|
20 |
+
return df.index.astype(str).tolist()
|
21 |
+
except Exception as e:
|
22 |
+
return [str(e)]
|
23 |
+
|
24 |
+
|
25 |
+
def get_col_names(file_path: str, sheet_name="Block5", header=4, index_col=7) -> List[str]:
|
26 |
+
"""
|
27 |
+
Diese Funktion liest alle Spaltennamen aus der angegebenen Excel-Datei und gibt sie als Liste von Zeichenketten zurück.
|
28 |
+
|
29 |
+
Argumente:
|
30 |
+
file_path (str): Der Dateipfad zur Excel-Datei.
|
31 |
+
|
32 |
+
Rückgabewert:
|
33 |
+
List[str]: Eine Liste von Spaltennamen als Zeichenketten.
|
34 |
+
"""
|
35 |
+
try:
|
36 |
+
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header, index_col=index_col)
|
37 |
+
return df.columns.astype(str).tolist()
|
38 |
+
except Exception as e:
|
39 |
+
return [str(e)]
|
40 |
+
|
41 |
+
|
42 |
+
def get_cell_value(file_path: str, row_name: str, col_name: str, sheet_name="Block5") -> str:
|
43 |
+
"""
|
44 |
+
Diese Funktion gibt den Wert der Zelle als Zeichenkette zurück, in der der angegebene Zeilenname und Spaltenname
|
45 |
+
zuerst gefunden wurden. Falls der Zeilenname oder Spaltenname nicht existiert, wird eine entsprechende
|
46 |
+
Fehlermeldung zurückgegeben.
|
47 |
+
|
48 |
+
Argumente:
|
49 |
+
file_path (str): Der Dateipfad zur Excel-Datei.
|
50 |
+
row_name (str): Der Name der Zeile.
|
51 |
+
col_name (str): Der Name der Spalte.
|
52 |
+
|
53 |
+
Rückgabewert:
|
54 |
+
str: Der Wert der Zelle als Zeichenkette, falls vorhanden. Andernfalls eine Fehlermeldung.
|
55 |
+
"""
|
56 |
+
try:
|
57 |
+
df = pd.read_excel(file_path, sheet_name=sheet_name, header=4, index_col=8)
|
58 |
+
if row_name not in df.index:
|
59 |
+
return f"Zeilenname '{row_name}' nicht im DataFrame gefunden."
|
60 |
+
if col_name not in df.columns:
|
61 |
+
return f"Spaltenname '{col_name}' nicht im DataFrame gefunden."
|
62 |
+
return str(df.at[row_name, col_name])
|
63 |
+
except Exception as e:
|
64 |
+
return str(e)
|
65 |
+
|
66 |
+
|
67 |
+
def get_header_dict():
|
68 |
+
file_path = 'data/kbob.xlsx'
|
69 |
+
# print(get_cell_value(file_path, 'sdlöfjasko', 'Kältetechnik (D06)'))
|
70 |
+
rows = get_row_names(file_path)
|
71 |
+
cols = get_col_names(file_path)
|
72 |
+
colheads = get_col_names(file_path, header=3)
|
73 |
+
head = ""
|
74 |
+
my_cols = {}
|
75 |
+
for e in colheads:
|
76 |
+
if "Unnamed" not in e:
|
77 |
+
head = e
|
78 |
+
else:
|
79 |
+
colheads[colheads.index(e)] = head
|
80 |
+
print(colheads)
|
81 |
+
print(cols)
|
82 |
+
print(len(colheads), len(cols))
|
83 |
+
for i in range(len(colheads)):
|
84 |
+
if "Unnamed" not in cols[i]:
|
85 |
+
if colheads[i] not in list(my_cols.keys()):
|
86 |
+
my_cols[colheads[i]] = [cols[i]]
|
87 |
+
else:
|
88 |
+
my_cols[colheads[i]].append(cols[i])
|
89 |
+
print(my_cols)
|
90 |
+
print((len(list(my_cols.keys()))))
|
91 |
+
return my_cols
|
92 |
+
|
93 |
+
|
94 |
+
def create_kbob_vectors(headers: dict):
|
95 |
+
for e in list(headers.keys()):
|
96 |
+
header_list = [e + ": " + headers[e][_] for _ in range(len(headers[e]))]
|
97 |
+
print(f"{e}: {header_list}")
|
98 |
+
print()
|
99 |
+
vecs = my_new_openai.vectorize_data(header_list)
|
100 |
+
my_1_writer.safe_my_dict_as_json(f"vecs/{e}_vecs.json", vecs)
|
101 |
+
|
102 |
+
|
103 |
+
if __name__ == '__main__':
|
104 |
+
h = get_header_dict()
|
105 |
+
print(h.keys())
|
106 |
+
#create_kbob_vectors(h)
|
my_1_reader.py
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MUSS AUFGERÄUMT WERDEN
|
2 |
+
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
import subprocess
|
6 |
+
import PyPDF2
|
7 |
+
import csv
|
8 |
+
import fitz # PyMuPDF
|
9 |
+
|
10 |
+
|
11 |
+
def extract_text_from_pdf(pdf_path):
|
12 |
+
"""
|
13 |
+
Extracts all text from a PDF file.
|
14 |
+
|
15 |
+
:param pdf_path: Path to the PDF file.
|
16 |
+
:return: Extracted text as a string.
|
17 |
+
"""
|
18 |
+
# Open the PDF file
|
19 |
+
doc = fitz.open(pdf_path)
|
20 |
+
|
21 |
+
# Initialize an empty string to hold the text
|
22 |
+
text = ''
|
23 |
+
|
24 |
+
# Iterate through each page in the PDF
|
25 |
+
for page_num in range(len(doc)):
|
26 |
+
# Get a page
|
27 |
+
page = doc.load_page(page_num)
|
28 |
+
|
29 |
+
# Extract text from the page and add it to the result
|
30 |
+
text += page.get_text()
|
31 |
+
|
32 |
+
# Close the document
|
33 |
+
doc.close()
|
34 |
+
|
35 |
+
return text
|
36 |
+
|
37 |
+
|
38 |
+
def read_pdfs_from_folder(folder_path):
|
39 |
+
"""
|
40 |
+
Reads all PDF files in the specified folder using PdfReader and extracts their text.
|
41 |
+
|
42 |
+
Parameters:
|
43 |
+
- folder_path: The path to the folder containing PDF files.
|
44 |
+
|
45 |
+
Returns:
|
46 |
+
- A dictionary with file names as keys and their extracted text as values.
|
47 |
+
"""
|
48 |
+
pdf_texts = {}
|
49 |
+
for filename in os.listdir(folder_path):
|
50 |
+
if filename.endswith('.pdf'):
|
51 |
+
file_path = os.path.join(folder_path, filename)
|
52 |
+
with open(file_path, 'rb') as pdf_file:
|
53 |
+
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
54 |
+
text = ''
|
55 |
+
for page in pdf_reader.pages:
|
56 |
+
try:
|
57 |
+
text += page.extract_text()
|
58 |
+
except UnicodeDecodeError as e:
|
59 |
+
print(e)
|
60 |
+
for c in text:
|
61 |
+
if c in ["ä", "Ä"]:
|
62 |
+
text = text[:text.index(c)] + "ae" + text[text.index(c)+1:]
|
63 |
+
if c in ["ö", "Ö"]:
|
64 |
+
text = text[:text.index(c)] + "oe" + text[text.index(c)+1:]
|
65 |
+
if c in ["ü", "Ü"]:
|
66 |
+
text = text[:text.index(c)] + "ue" + text[text.index(c)+1:]
|
67 |
+
if c in [",", ";", "\\", '"']:
|
68 |
+
text = text[:text.index(c)] + "_" + text[text.index(c)+1:]
|
69 |
+
if c in ["/n", "\n"]:
|
70 |
+
text = text[:text.index(c)] + "<newline>" + text[text.index(c) + 1:]
|
71 |
+
pdf_texts[filename] = text
|
72 |
+
return pdf_texts
|
73 |
+
|
74 |
+
|
75 |
+
def read_csv_lines_as_strings(filename):
|
76 |
+
"""
|
77 |
+
Opens a CSV file and returns each line as a string in a list.
|
78 |
+
|
79 |
+
Parameters:
|
80 |
+
- filename: The path to the CSV file.
|
81 |
+
|
82 |
+
Returns:
|
83 |
+
- A list of strings, each representing a line from the CSV file.
|
84 |
+
"""
|
85 |
+
lines_as_strings = []
|
86 |
+
with open(filename, newline='') as csvfile:
|
87 |
+
try:
|
88 |
+
reader = csv.reader(csvfile)
|
89 |
+
for row in reader:
|
90 |
+
# Convert the row (a list of values) back into a comma-separated string
|
91 |
+
line_as_string = ','.join(row)
|
92 |
+
lines_as_strings.append(line_as_string)
|
93 |
+
except UnicodeDecodeError as e:
|
94 |
+
print(e)
|
95 |
+
return lines_as_strings
|
96 |
+
|
97 |
+
|
98 |
+
# Function to load data from JSON files
|
99 |
+
def load_data(filename):
|
100 |
+
with open(filename, 'r') as file:
|
101 |
+
try:
|
102 |
+
return json.load(file)
|
103 |
+
except UnicodeDecodeError as err:
|
104 |
+
print(err)
|
105 |
+
return {}
|
106 |
+
|
107 |
+
|
108 |
+
def find_and_open_file(filename, start_directory):
|
109 |
+
"""
|
110 |
+
Attempts to open a file with the given filename starting from the specified directory.
|
111 |
+
If the file is not found, searches recursively in all subfolders. Works across macOS, Linux, and Windows.
|
112 |
+
"""
|
113 |
+
for root, dirs, files in os.walk(start_directory):
|
114 |
+
if filename in files:
|
115 |
+
filepath = os.path.join(root, filename)
|
116 |
+
print(f"File found: {filepath}")
|
117 |
+
return filepath
|
118 |
+
print(f"File {filename} not found.")
|
119 |
+
return None
|
120 |
+
|
121 |
+
|
122 |
+
def open_file(filepath):
|
123 |
+
"""
|
124 |
+
Opens the file with the default application, based on the operating system.
|
125 |
+
"""
|
126 |
+
if os.path.exists(filepath):
|
127 |
+
if os.name == 'posix': # Linux, macOS, etc.
|
128 |
+
subprocess.call(('open', filepath))
|
129 |
+
elif os.name == 'nt': # Windows
|
130 |
+
os.startfile(filepath)
|
131 |
+
else:
|
132 |
+
print(f"Cannot open file on this operating system: {filepath}")
|
133 |
+
else:
|
134 |
+
print(f"File does not exist: {filepath}")
|
135 |
+
|
136 |
+
|
137 |
+
def list_folders_files_recursive(path, depth=0):
|
138 |
+
"""
|
139 |
+
Recursively lists all folders and files within the specified path, including subfolders.
|
140 |
+
|
141 |
+
Parameters:
|
142 |
+
- path: The directory path to list contents from.
|
143 |
+
- depth: The current depth of recursion (used for indentation in print statements).
|
144 |
+
|
145 |
+
Returns:
|
146 |
+
- None
|
147 |
+
"""
|
148 |
+
# Ensure the provided path is a directory
|
149 |
+
if not os.path.isdir(path):
|
150 |
+
print(f"The provided path '{path}' is not a valid directory.")
|
151 |
+
return
|
152 |
+
|
153 |
+
indent = ' ' * depth # Indentation based on recursion depth
|
154 |
+
folders, files = [], []
|
155 |
+
|
156 |
+
# List all entries in the directory
|
157 |
+
for entry in os.listdir(path):
|
158 |
+
full_path = os.path.join(path, entry)
|
159 |
+
if os.path.isdir(full_path):
|
160 |
+
folders.append(entry)
|
161 |
+
print(f"{indent}Folder: {entry}")
|
162 |
+
# Recursively list subfolders and files
|
163 |
+
list_folders_files_recursive(full_path, depth + 1)
|
164 |
+
elif os.path.isfile(full_path):
|
165 |
+
files.append(entry)
|
166 |
+
|
167 |
+
for f in files:
|
168 |
+
print(f"{indent}File: {f}")
|
169 |
+
|
170 |
+
|
171 |
+
def list_folders_files(path):
|
172 |
+
"""
|
173 |
+
Lists all folders and files within the specified path.
|
174 |
+
|
175 |
+
Parameters:
|
176 |
+
- path: The directory path to list contents from.
|
177 |
+
|
178 |
+
Returns:
|
179 |
+
- A tuple of two lists: (folders, files).
|
180 |
+
"""
|
181 |
+
folders = []
|
182 |
+
files = []
|
183 |
+
|
184 |
+
# Ensure the provided path is a directory
|
185 |
+
if not os.path.isdir(path):
|
186 |
+
print(f"The provided path '{path}' is not a valid directory.")
|
187 |
+
return folders, files
|
188 |
+
|
189 |
+
# List all entries in the directory
|
190 |
+
for entry in os.listdir(path):
|
191 |
+
full_path = os.path.join(path, entry)
|
192 |
+
if os.path.isdir(full_path):
|
193 |
+
folders.append(entry)
|
194 |
+
elif os.path.isfile(full_path):
|
195 |
+
files.append(entry)
|
196 |
+
|
197 |
+
return folders, files
|
198 |
+
|
199 |
+
|
200 |
+
if __name__ == "__main__":
|
201 |
+
print("here are all functions that read files")
|
my_1_writer.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# NEEDS CLEANING UP

import json
import pandas as pd


def split_json_file(input_filepath, lines_per_file=50):
    """
    Splits a JSON file into multiple files, each containing up to 'lines_per_file' lines.

    param input_filepath: The path to the input JSON file.
    param lines_per_file: The maximum number of lines per output file.
    """
    # Counter for file naming
    file_counter = 1
    # Open the input file
    with open(input_filepath, 'r') as input_file:
        # Read the lines from the input file
        lines = input_file.readlines()
        # Iterate through the lines in chunks of 'lines_per_file'
        for i in range(0, len(lines), lines_per_file):
            # Determine the output file name
            output_filename = f'translate_data/english_{file_counter}.json'
            # Write the current chunk to the output file
            with open(output_filename, 'w') as output_file:
                # Grab the current chunk of lines
                chunk = lines[i:i+lines_per_file]
                # Write each line to the output file
                for line in chunk:
                    output_file.write(line)
            print(f'Created {output_filename}')
            # Increment the file counter
            file_counter += 1


def merge_and_save(list1, list2, dict1, dict2, filename='output.csv'):
    """
    Merges two lists and two dictionaries into a pandas DataFrame according to the specified structure:
    headers: ['list1', 'list2', 'keys dict1', 'vals dict1', 'keys dict2', 'vals dict2']
    and saves it as a CSV file.

    Parameters:
    - list1 (list): First list to merge, contributing to column 'list1'.
    - list2 (list): Second list to merge, contributing to column 'list2'.
    - dict1 (dict): First dictionary to merge, keys and values added as separate columns.
    - dict2 (dict): Second dictionary to merge, keys and values added as separate columns.
    - filename (str): Filename for the saved CSV file.
    """
    # Combining all elements into a structured list of dictionaries for DataFrame construction
    data = []
    dict1_items = list(dict1.items())
    dict2_items = list(dict2.items())
    for i in range(len(list1)):
        row = {
            'list1': list1[i],
            'list2': list2[i],
            'keys dict1': dict1_items[i][0],
            'vals dict1': dict1_items[i][1],
            'keys dict2': dict2_items[i][0],
            'vals dict2': dict2_items[i][1]
        }
        data.append(row)

    # Creating the DataFrame
    df = pd.DataFrame(data)

    # Saving the DataFrame to a CSV file
    df.to_csv(filename, index=False)
    print(f"DataFrame saved as '{filename}' in the current directory.")


# new line for every entry
def safe_my_dict_as_json(file_name, my_dict):
    print(my_dict)
    # Open a file for writing
    with open(file_name, 'w') as f:
        # Write the opening brace of the JSON object
        f.write('{\n')
        # If a list was passed, use its first element (assumed to be the dict)
        # before counting items, so the comma logic below stays correct
        if type(my_dict) == list:
            my_dict = my_dict[0]
        # Get total number of items to control comma insertion
        total_items = len(my_dict)
        # Iterate over items, keeping track of the current item index
        for i, (key, value) in enumerate(my_dict.items()):
            # Serialize the key with JSON to handle special characters and ensure proper quoting
            json_key = json.dumps(key)
            # Convert the value to a JSON-formatted string (without indentation)
            json_value = json.dumps(value)
            # Determine if a comma is needed (for all but the last item)
            comma = ',' if i < total_items - 1 else ''
            # Write the formatted string to the file
            f.write(f" {json_key}: {json_value}{comma}\n")
        # Write the closing brace of the JSON object
        f.write('}\n')


if __name__ == "__main__":
    print("here are all functions that write to the Datasets")
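As a quick sanity check, a small sketch of how safe_my_dict_as_json lays the dictionary out one key per line; the file name and the toy vectors are hypothetical, and values must be JSON-serializable:

# hypothetical round trip with the writer above
import json
import my_1_writer

vecs = {"wall": [0.1, 0.2], "door": [0.3, 0.4]}
my_1_writer.safe_my_dict_as_json("demo_vecs.json", vecs)

with open("demo_vecs.json") as f:
    print(json.load(f))   # the file is still valid JSON: {'wall': [0.1, 0.2], 'door': [0.3, 0.4]}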
my_2_sim_search.py
ADDED
@@ -0,0 +1,163 @@
import my_new_openai
import my_1_writer
import json
import numpy as np


# sim search with dot_product and lin_distance
# the newly vectorized TERM will be added to the database
# database = .json file
def sim_search_load_db(database, term, add_to_db=True, debug=False):
    if type(term) == str:
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif type(term) == list:
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    with open(database, "r") as f:
        table = json.load(f)
    sim_search_dict = {}
    for key in table.keys():
        vector2 = table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        dp = np.dot(vector1, vector2)
        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is {distance}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp * distance

    # sort with the biggest similarity
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))

    if debug:
        # dicts cannot be sliced; take the first five items instead
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    if add_to_db:
        if term in table.keys():
            print("the search term is in the database!")
        # add the newly vectorized term to the words, if not already in the vector table
        else:
            if database != "session/my_words_vec_table.json":
                database = "session/my_vecs.json"
            # table = load_df(database)  # ??
            table[str(term)] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)
    # first_key, first_value = list(sorted_table.items())[0]
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table


def dot_p_to_1(database, vector1=0, analysis_filename=0):

    with open(database, "r") as f:
        table = json.load(f)
    dot_product_to1 = {}

    if vector1 == 0:
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    for key in table.keys():
        dot_product_to1[key] = np.dot(vector1, table[key])
    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
    print("dot p to 1 saved")


def lin_dist(database, vector1=0, analysis_filename=0):
    with open(database, "r") as f:
        table = json.load(f)
    lin_dist_to_1 = {}

    if vector1 == 0:
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    for key in table.keys():
        lin_dist_to_1[key] = np.linalg.norm(np.array(vector1) - np.array(table[key]))

    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
    print("lin dist to 1 saved")


def manhattan_dist(database, vector1=0, analysis_filename=0):
    with open(database, "r") as f:
        table = json.load(f)
    manhattan_dist_to_1 = {}

    if vector1 == 0:
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    for key in table.keys():
        # Manhattan distance is the sum of absolute coordinate differences
        manhattan_dist_to_1[key] = np.sum(np.abs(np.array(vector1) - np.array(table[key])))

    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
    print("manhattan dist to 1 saved")


# vec_table
def sim_search_fly(vec_table, term, debug=False):
    if debug:
        print(type(vec_table))
        print(type(term))
        print(type(vec_table[list(vec_table.keys())[0]]))
        print("vec table:")
        print(vec_table[list(vec_table.keys())[5]][:4])
        print("search term")
        print(term[:4])
    if type(term) == str:
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif type(term) == list:
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return

    sim_search_dict = {}
    for key in vec_table.keys():
        vector2 = vec_table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        if vector2[0] == vector2[1] and vector2[3] == vector2[4] and vector2[5] == vector2[6]:
            dp = 200
        else:
            dp = np.dot(vector1, vector2)
        # distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            # print(f"the distance is {distance}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp  # * distance

    # sort with the biggest similarity
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))

    if debug:
        # dicts cannot be sliced; take the first five items instead
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")

    # first_key, first_value = list(sorted_table.items())[0]
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table
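For reference, a minimal sketch of querying sim_search_fly with an in-memory table and a precomputed query vector; the toy vectors are hypothetical. Passing a list skips the call to vectorize_data, although importing my_2_sim_search still pulls in my_new_openai, which expects OPENAI_API_KEY to be set. Since a raw dot product is scale-sensitive, normalising both vectors first (cosine similarity) is a common alternative to the ranking used above:

# hypothetical in-memory search with sim_search_fly
import my_2_sim_search

vec_table = {
    "wall":   [0.9, 0.1, 0.0, 0.2, 0.1, 0.3, 0.4],
    "door":   [0.1, 0.8, 0.1, 0.3, 0.2, 0.1, 0.0],
    "window": [0.2, 0.2, 0.7, 0.1, 0.0, 0.2, 0.1],
}
query = [0.85, 0.15, 0.05, 0.2, 0.1, 0.3, 0.4]   # already a vector, so no embedding request is made

ranking = my_2_sim_search.sim_search_fly(vec_table, query)
print(list(ranking.items())[:2])                 # highest dot products first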
my_new_openai.py
ADDED
@@ -0,0 +1,153 @@
# not up to date

import os
from openai import OpenAI
import requests
import base64

client = OpenAI()


def image_bytes_to_base64(image_bytes):
    """
    Converts an image from bytes to a Base64 encoded string.

    Args:
        image_bytes (bytes): Byte content of the image.

    Returns:
        str: A Base64 encoded string of the image.
    """
    return base64.b64encode(image_bytes).decode('utf-8')


def image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        return str(base64.b64encode(image_file.read()).decode('utf-8'))


def gpt4_new(prompt_text):
    gpt_response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "system",
                   "content": "Du bist eine Maschine, die Dokumente klassifiziert."},
                  {"role": "user", "content": prompt_text}])
    return gpt_response.choices[0].message.content


def vectorize_data(data_input):
    # input can be list or string:

    if isinstance(data_input, list):
        # returning a dictionary
        my_dict = {}
        for item in data_input:
            # embed each item individually (embedding the whole list would give every key the same vector)
            my_dict[str(item)] = client.embeddings.create(input=item,
                                                          model="text-embedding-ada-002").data[0].embedding
        return my_dict

    elif isinstance(data_input, str):
        # returning just the vector
        return client.embeddings.create(input=data_input, model="text-embedding-ada-002").data[0].embedding

    else:
        print("none")


def img_create(prompt="a nice house on the beach", download_path=""):
    # to open, must download
    my_url = client.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024").data[0].url
    if download_path:
        my_image = requests.get(my_url)
        if my_image.status_code == 200:
            with open(download_path, 'wb') as f:
                f.write(my_image.content)
        else:
            print("Failed to retrieve image")
    return my_url


def img_to_text(img_url="", img_base64="", prompt="What’s in this image?", print_out=True):
    if img_url:
        img_desc_response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": img_url,
                            },
                        },
                    ],
                }
            ],
            max_tokens=500,
        )
        if print_out:
            print(img_desc_response.choices[0].message.content)
        return img_desc_response.choices[0].message.content
    elif img_base64:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
        }
        payload = {
            "model": "gpt-4-turbo",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{img_base64}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 300
        }
        img_desc_response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        if print_out:
            print(img_desc_response.json()["choices"][0]["message"]["content"])
        return img_desc_response.json()["choices"][0]["message"]["content"]
    else:
        # neither a URL nor Base64 data was supplied
        raise ValueError("either img_url or img_base64 must be provided")


def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
    return encoded_string


def table_to_text(table=None, prompt="describe this table in plain text. "
                                     "be as precise as possible. spare no detail. "
                                     "what is in this table?", print_out=True):
    if table is not None:
        response = gpt4_new(f"{prompt} TABLE: {table}")
        if print_out:
            print(response)
        return response
    else:
        raise ValueError("no table was provided")


if __name__ == "__main__":
    # print("here are all functions that directly call openai.")
    # img_create("a skier in the swiss alps", download_path="skier.png")
    # img_to_text(img_base64=encode_image_to_base64("skier.png"))
    # print(image_to_base64("skier.png"))
    # print(vectorize_data("test string"))

    print(gpt4_new("Say hello."))  # gpt4_new() requires a prompt_text argument; placeholder prompt
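A short usage sketch of the wrappers above. It assumes OPENAI_API_KEY is set in the environment (OpenAI() reads it from there); the prompts and the download path are hypothetical:

# hypothetical calls against the wrappers above (requires a valid OPENAI_API_KEY)
import my_new_openai

print(my_new_openai.gpt4_new("Classify this document title: 'Brandschutzkonzept 2. OG'"))

vec = my_new_openai.vectorize_data("Brandschutzkonzept")   # single string -> one 1536-dim ada-002 embedding
print(len(vec))

url = my_new_openai.img_create("a timber-frame building site", download_path="site.png")
print(url)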
my_vectors.py
ADDED
@@ -0,0 +1,17 @@
def safe_local(vectors, path):
    pass


def merge_two(vec1, vec2):
    pass


def load_local(path):
    pass


if __name__ == "__main__":
    print("you are in the my_vectors")
requirements.txt
ADDED
@@ -0,0 +1,15 @@
streamlit~=1.33.0
bcrypt~=4.1.2
psycopg2-binary~=2.9.9
openai~=1.23.2
pypdf2~=3.0.1
langchain~=0.1.16
tiktoken~=0.6.0
numpy~=1.26.4
requests~=2.31.0
pandas~=2.2.2
tabula~=1.0.5
pdfplumber~=0.11.0
PyMuPDF~=1.24.3
pillow~=10.3.0
openpyxl~=3.1.2
setup_db.py
ADDED
@@ -0,0 +1,50 @@
import time
import openpyxl
import my_new_openai


def update_excel_with_sums(filename):
    # Load the workbook and select the active worksheet
    workbook = openpyxl.load_workbook(filename)
    sheet = workbook.active

    # Iterate through each row in the sheet
    for row in sheet.iter_rows(min_row=1, min_col=2, max_col=3):
        Bn, Cn = row  # Assuming B and C are columns 2 and 3 respectively
        vector = my_new_openai.vectorize_data(f"{Bn.value}: {Cn.value}") if Bn.value and Cn.value else 0
        if vector != 0:
            # Write each embedding component into its own column, starting at column D
            # (enumerate avoids the misplacement that vector.index(val) causes for duplicate values)
            for offset, val in enumerate(vector):
                sheet.cell(row=Bn.row, column=4 + offset).value = val

    # Save the workbook
    workbook.save(filename)
    print(f"Updated the file '{filename}' with vectors starting in column D.")


def load_vectorstore_from_excel(filename):
    # returns a dictionary
    # Load the workbook and select the active worksheet
    workbook = openpyxl.load_workbook(filename)
    sheet = workbook.active

    # Iterate through each row in the sheet
    vec_store = {}
    for row in range(3, 634):
        vec = []
        for col in range(0, 1536):
            val = sheet.cell(row=row, column=4 + col).value
            vec.append(val)
        vec_store[str(sheet.cell(row=row, column=1).value)] = vec
    return vec_store


if __name__ == '__main__':
    # update_excel_with_sums("KBOB_Klassifizierung.xlsx")
    t = time.time()

    vec_store = load_vectorstore_from_excel("data/KBOB_Klassifizierung.xlsx")

    print(time.time() - t)
    for e in vec_store.keys():
        print(f"{e}: {vec_store[e][0]}, {vec_store[e][1]}, .... {vec_store[e][-1]}")
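Putting the pieces together, a hedged end-to-end sketch: load the vector store that update_excel_with_sums writes into the spreadsheet and query it with sim_search_fly. The spreadsheet path mirrors the __main__ block above; the search term is a hypothetical example and is vectorized via the OpenAI API:

# hypothetical end-to-end query against the Excel-backed vector store
import my_2_sim_search
import setup_db

vec_store = setup_db.load_vectorstore_from_excel("data/KBOB_Klassifizierung.xlsx")
ranking = my_2_sim_search.sim_search_fly(vec_store, "Sanitäranlagen")
print(list(ranking.keys())[:5])   # the five closest entries from column A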