# Spaces: Sleeping
import time | |
import streamlit as st | |
import os | |
# import openai | |
from PyPDF2 import PdfReader | |
from openai import OpenAI | |
from langchain.chat_models import ChatOpenAI | |
# Password that gates the classifier UI (compared against the user's input in
# main()). Read at import time, so a missing environment variable raises
# KeyError as soon as the script starts.
ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
# System prompt sent with every classification request. Adjacent string
# literals are concatenated verbatim, so every fragment must end with its own
# separator; without it the sentences run together in the text the model sees.
CLASSIFIER_SYSTEM_PROMPT = (
    "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten, "
    "das Dokument in vorgegebene Kategorien klassifiziert. "
    "Du gibts möglichst kurze Antworten, am besten ein Wort. "
    "Du gibst keine Erklärungen oder Begründungen. "
    "Du klassifizierst nur nach den vorgegebenen Kategorien. "
    "Wenn ein Dokument partout nicht klassifizierbar ist, "
    "antwortest du mit '<no classification>'"
)


def gpt4_new(prompt_text):
    """Send ``prompt_text`` to the ``gpt-4`` chat model and return its reply.

    The request consists of the fixed classifier system prompt followed by
    ``prompt_text`` as the single user message.

    Args:
        prompt_text: Full user prompt (instruction, category list and
            document text) as one string.

    Returns:
        The ``content`` of the first choice in the completion response.

    Raises:
        Any exception raised by the OpenAI client is propagated unchanged;
        callers decide how to handle failed requests.
    """
    # The API key is looked up on every call; ``None`` is passed through to the
    # client when the variable is unset.
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": CLASSIFIER_SYSTEM_PROMPT},
            {"role": "user", "content": prompt_text},
        ],
    )
    return response.choices[0].message.content
# Legacy helper: not used by the current app flow.
def ask_gpt4(question):
    """Send ``question`` to the ``gpt-4`` chat model and return the reply text.

    The question is echoed to stdout and then submitted as a single user
    message, without a system prompt.

    Args:
        question: The prompt text to submit.

    Returns:
        The ``content`` of the first completion choice. If anything goes wrong
        while creating the client or performing the request, the exception's
        message is returned as a string instead of being raised.
    """
    print(question)
    try:
        # Same client and call shape as gpt4_new(), so the question is
        # actually submitted and the response is read via attribute access.
        client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": question}],
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort helper: report the failure through the return value so
        # that batch callers can record it next to the prompt.
        return str(e)
def process_prompts_and_save(my_prompts):
    """Run every prompt through ``ask_gpt4`` and save the results to disk.

    For each prompt an entry of the form ``"<prompt>\\n\\n<answer>\\n\\n\\n\\n"``
    is built and printed. If building the entry raises, an entry of the form
    ``"<prompt>\\n\\nError:<message>\\n\\n\\n\\n"`` is recorded instead. All
    entries are then written, in order, to ``gpt4_responses.txt`` (UTF-8,
    overwriting any existing file).

    Args:
        my_prompts: Iterable of prompt strings.
    """
    collected = []
    for item in my_prompts:
        try:
            entry = f"{item}\n\n{ask_gpt4(item)}\n\n\n\n"
            print(entry)
        except Exception as err:
            entry = f"{item}\n\nError:{str(err)}\n\n\n\n"
        collected.append(entry)

    with open('gpt4_responses.txt', 'w', encoding='utf-8') as out_file:
        out_file.write("".join(collected))
def get_pdfs_text(pdf_docs):
    """Return the text of several PDFs as one string.

    Each document is read in iteration order and the per-document texts are
    concatenated directly, without any separator.

    Args:
        pdf_docs: Iterable of objects accepted by ``PdfReader``.

    Returns:
        The concatenated text of all pages of all documents; an empty string
        when ``pdf_docs`` is empty.
    """
    return "".join(get_pdf_text(document) for document in pdf_docs)
def get_pdf_text(pdf_document):
    """Return the text of a single PDF as one string.

    Args:
        pdf_document: Object accepted by ``PdfReader``.

    Returns:
        The result of ``extract_text()`` for every page, concatenated in page
        order without a separator.
    """
    reader = PdfReader(pdf_document)
    return "".join(page.extract_text() for page in reader.pages)
def json_open(filename):
    """Return the entire content of ``filename`` as a string.

    Despite its name, this function does not parse JSON; it only reads text.
    The file is decoded as UTF-8, matching the encoding this module uses when
    it writes its output files. Relying on the platform default encoding would
    garble non-ASCII characters on systems where that default is not UTF-8.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file content as ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, "r", encoding="utf-8") as f:
        return f.read()
def _render_readme():
    """Render the README section (purpose, licence, limitations, roadmap)."""
    st.subheader("Funktion: ")
    st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren. lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren. Feedback und Bugs gerne an elia.waefler@insel.ch")
    st.write("Vielen Dank.")
    st.write("")
    st.subheader("Licence and credits")
    st.write("THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.")
    st.write("special thanks to OpenAI, Huggingface, Streamlit")
    l, r = st.columns(2)
    with l:
        st.subheader("Limitationen: ")
        st.write("bisher nur PDFs")
        st.write("nur Disziplin, Doc typ. und Geschoss")
        st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
        st.write("")
    with r:
        st.subheader("geplante Erweiterungen:")
        st.write("Text Beschreibung wird von AI hinzugefügt")
        st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
        st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")


def _classify_or_fallback(instruction, categories, pdf_text):
    """Ask GPT-4 for one classification of ``pdf_text``.

    The prompt is ``auftrag_0 + instruction + str(categories) + pdf_text``.
    Returns the model's answer, or ``"<err_no_classification>"`` when the
    request fails; the failure is logged to stdout.
    """
    # NOTE(review): the module-level ``auftrag_2`` fragment ("... Hier der
    # Dokumenteninhalt: ") is not part of this prompt; presumably it was meant
    # to precede the document text. Confirm before adding it.
    prompt = auftrag_0 + instruction + str(categories) + pdf_text
    try:
        return gpt4_new(prompt)
    except Exception as exc:
        # Deliberately best-effort: one failed request must not abort the
        # whole batch. Exception (not a bare except) keeps KeyboardInterrupt
        # and SystemExit working.
        print(f"classification failed: {type(exc).__name__}: {exc}")
        return "<err_no_classification>"


def _format_metadata(all_metadata):
    """Serialise metadata records to the text written to the metadata file.

    Every record becomes a newline, then each field followed by ``;``, then a
    closing newline.
    """
    parts = []
    for record in all_metadata:
        parts.append("\n")
        parts.extend(f"{item};" for item in record)
        parts.append("\n")
    return "".join(parts)


def main():
    """Run the Streamlit "Doc Classifier" page.

    Flow:
      1. Show the title and, when toggled, the README section.
      2. Until ``st.session_state.login`` is true, show a password prompt that
         is checked against ``ASK_ASH_PASSWORD``.
      3. Once logged in, accept uploaded files and, on button press, classify
         each one by discipline, document type and storey via ``gpt4_new``.
         The results are shown in three columns, written to
         ``ai_generated_metadata.txt`` and offered as a download.

    Reads the module-level globals ``auftrag_0``, ``auftrag_1_disziplin``,
    ``auftrag_1_type``, ``auftrag_1_ge``, ``Baubranchen_Disziplinen``,
    ``Dokumententypen`` and ``ASH_Geschosse``; they must be defined before this
    function is called.
    """
    # Local import: only the password check needs it, and it keeps this block
    # independent of the file's import section.
    import hmac

    st.title("Doc Classifier")
    if st.toggle("show README"):
        _render_readme()

    if "login" not in st.session_state:
        st.session_state.login = False

    if not st.session_state.login:
        user_pw = st.text_input("ASK_ASH_PASSWORD: ", type="password")
        if st.button("check"):
            # Small fixed delay to slow down repeated guessing.
            time.sleep(0.5)
            # Constant-time comparison; bytes are used because compare_digest
            # rejects non-ASCII str arguments.
            if hmac.compare_digest(user_pw.encode("utf-8"),
                                   ASK_ASH_PASSWORD.encode("utf-8")):
                st.session_state.login = True
                st.rerun()
        return

    uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
    if not st.button("classify KBOB!"):
        return

    # With accept_multiple_files=True the uploader yields an empty list (not
    # None) when nothing was uploaded, so test for emptiness.
    if not uploaded_files:
        st.warning("no file")
        return

    with st.container():
        col1, col2, col3 = st.columns(3)
        # One entry per classification: target column, header, instruction
        # fragment and the list of allowed categories.
        tasks = (
            (col1, "Disziplin", auftrag_1_disziplin, Baubranchen_Disziplinen),
            (col2, "Dokumententyp", auftrag_1_type, Dokumententypen),
            (col3, "Geschoss", auftrag_1_ge, ASH_Geschosse),
        )
        for column, header, _, _ in tasks:
            with column:
                st.write(header)
                st.write("")

        all_metadata = []
        for file in uploaded_files:
            metadata = [str(file.name)]
            with col1:
                with st.spinner("GPT4 at work"):
                    pdf_text = str(get_pdf_text(file))
            for column, _, instruction, categories in tasks:
                with column:
                    with st.spinner("GPT4 at work"):
                        answer = _classify_or_fallback(instruction, categories, pdf_text)
                    st.write(answer)
                metadata.append(str(answer))
            all_metadata.append(metadata)

        metadata_filename = "ai_generated_metadata.txt"
        content = _format_metadata(all_metadata)
        with open(metadata_filename, 'w', encoding='utf-8') as f:
            f.write(content)
        st.success("classified, saved")
        # Offer exactly what was written, without re-reading the file.
        st.download_button("Download Metadata", content, file_name=metadata_filename)
# Prompt fragments and category lists that main() reads as module-level
# globals. They are defined at import time rather than only under the
# ``__main__`` guard, so main() also works when this module is imported
# instead of being run as a script.
auftrag_0 = "Klassifiziere dieses Dokument nach "
auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
auftrag_1_type = "diesen 'Dokumententypen': "
auftrag_1_ge = "diesen 'Geschossen': "

# Allowed discipline labels: a letter code followed by the discipline name.
Baubranchen_Disziplinen = [
    'A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
    'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
    'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
    'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
    'Z-Lichtplanung',
]

# NOTE(review): this fragment is defined but not referenced by the prompts
# that main() builds; presumably it was meant to precede the document text.
auftrag_2 = (
    "gib nur den am besten passendsten Eintrag zurück. "
    "Keine weiteren Ausführungen oder Erklärungen. "
    "Antworte am besten in einem Wort. "
    "Hier der Dokumenteninhalt: "
)

# Allowed document-type labels.
Dokumententypen = [
    'Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
    'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation',
]

# Allowed storey labels.
ASH_Geschosse = [
    'U4', 'U3', 'U2', 'U1',
    'A', 'B', 'C', 'D', 'E', 'F', 'G',
]

if __name__ == "__main__":
    main()