Spaces:

elia-waefler
/

classify_ASH

Sleeping

App Files Files Community

classify_ASH / app.py

elia-waefler

gpt funktion

9d007c3 9 months ago

raw

history blame contribute delete

9.75 kB

	import time
	import streamlit as st
	import os
	# import openai
	from PyPDF2 import PdfReader
	from openai import OpenAI
	from langchain.chat_models import ChatOpenAI
	# Password that main() compares against the login input. It is read from the
	# environment when the module is loaded; a missing ASK_ASH_PASSWORD variable
	# raises KeyError right here.
	ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]


	# System prompt sent with every classification request. The fragments are joined
	# by implicit string concatenation, so each fragment carries its own trailing
	# separator; without it the sentences are fused together in the final prompt.
	CLASSIFIER_SYSTEM_PROMPT = (
	    "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten, "
	    "das Dokument in vorgegebene Kategorien klassifiziert. "
	    "Du gibts möglichst kurze Antworten, am besten ein Wort. "
	    "Du gibst keine Erklärungen oder Begründungen. "
	    "Du klassifizierst nur nach den vorgegebenen Kategorien. "
	    "Wenn ein Dokument partout nicht klassifizierbar ist, "
	    "antwortest du mit '<no classification>'"
	)


	def gpt4_new(prompt_text):
	    """Send *prompt_text* to the "gpt-4" chat model and return the reply text.

	    The request consists of the fixed classifier system prompt followed by
	    *prompt_text* as the user message. The API key is read from the
	    OPENAI_API_KEY environment variable.

	    Args:
	        prompt_text: Complete user prompt (instruction, categories, document text).

	    Returns:
	        The message content of the first choice in the response.

	    Raises:
	        Whatever the OpenAI client raises; errors are not handled here.
	    """
	    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
	    response = client.chat.completions.create(
	        model="gpt-4",
	        messages=[
	            {"role": "system", "content": CLASSIFIER_SYSTEM_PROMPT},
	            {"role": "user", "content": prompt_text},
	        ],
	    )
	    return response.choices[0].message.content


	# not currently used
	def ask_gpt4(question):
	    """Send *question* to a default-configured ``ChatOpenAI`` model.

	    Args:
	        question: The prompt text to submit.

	    Returns:
	        The content of the model's reply. If anything goes wrong while calling
	        the model, the exception message is returned as a string instead.
	    """
	    print(question)
	    try:
	        # Constructing ChatOpenAI sends nothing by itself; the question has to be
	        # passed to invoke(), which returns a message object holding the text.
	        reply = ChatOpenAI().invoke(question)
	        return reply.content
	    except Exception as e:
	        # Deliberate best effort: report the failure as the answer text.
	        return str(e)


	def process_prompts_and_save(my_prompts):
	    """Run every prompt through ``ask_gpt4`` and dump the results to a text file.

	    For each prompt one entry is produced: the prompt, a blank line, then either
	    the answer or ``Error:<message>`` if producing the answer failed. All entries
	    are written to 'gpt4_responses.txt' (UTF-8), replacing any previous content.

	    Args:
	        my_prompts: Iterable of prompt strings.
	    """
	    collected = []

	    for item in my_prompts:
	        try:
	            # ADD LOGIC TO READ FILE AND CLASSIFY
	            reply = ask_gpt4(item)
	            entry = f"{item}\n\n{reply}\n\n\n\n"
	            print(entry)
	        except Exception as err:
	            # Keep going; record the failure next to the prompt that caused it.
	            entry = f"{item}\n\nError:{str(err)}\n\n\n\n"
	        collected.append(entry)

	    with open('gpt4_responses.txt', 'w', encoding='utf-8') as out:
	        out.write("".join(collected))


	def get_pdfs_text(pdf_docs):
	    """Return the text of all pages of all PDFs in *pdf_docs* as one string.

	    Each element of *pdf_docs* is opened with ``PdfReader``; page texts are
	    concatenated in document order, then page order, without any separator.
	    An empty *pdf_docs* yields an empty string.
	    """
	    chunks = []
	    for document in pdf_docs:
	        reader = PdfReader(document)
	        chunks.extend(page.extract_text() for page in reader.pages)
	    return "".join(chunks)


	def get_pdf_text(pdf_document):
	    """Return the text of every page of *pdf_document* joined into one string.

	    The document is opened with ``PdfReader``; page texts are concatenated in
	    page order without any separator.
	    """
	    reader = PdfReader(pdf_document)
	    return "".join(page.extract_text() for page in reader.pages)


	def json_open(filename, encoding="utf-8"):
	    """Return the whole content of the text file *filename* as a string.

	    Despite the name, the content is returned as-is and is not parsed as JSON.

	    Args:
	        filename: Path of the file to read.
	        encoding: Text encoding used for decoding. Defaults to UTF-8 so the
	            result does not depend on the platform's locale encoding; the
	            metadata file this app produces is written as UTF-8.

	    Returns:
	        The decoded file content.
	    """
	    with open(filename, "r", encoding=encoding) as f:
	        return f.read()


	def _classify(instruction, categories, pdf_text):
	    """Ask the model which entry of *categories* fits the document text.

	    Args:
	        instruction: Prompt fragment naming the category set (one of the
	            module-level ``auftrag_1_*`` strings).
	        categories: List of allowed category labels.
	        pdf_text: Extracted text of the document.

	    Returns:
	        The model's answer, or "<err_no_classification>" if the request failed.
	    """
	    # NOTE(review): the module-level ``auftrag_2`` fragment is never added to the
	    # prompt, so the category list is followed directly by the document text.
	    # Confirm whether that is intended.
	    prompt = auftrag_0 + instruction + str(categories) + pdf_text
	    try:
	        answer = gpt4_new(prompt)
	    except Exception:
	        # Best effort: one failed request must not abort the whole batch.
	        # ``Exception`` (not a bare except) keeps interrupts and other
	        # BaseException-based control flow from being swallowed.
	        answer = "<err_no_classification>"
	    print(prompt)
	    return answer


	def _format_metadata(all_metadata):
	    """Serialise rows as text: a newline before each row, ';' after each item,
	    and one final newline after the last row."""
	    rows = "".join(
	        "\n" + "".join(f"{item};" for item in row) for row in all_metadata
	    )
	    return rows + "\n"


	def main():
	    """Render the Streamlit page.

	    Shows an optional README, then either a password prompt or, once
	    ``st.session_state.login`` is set, the uploader. On "classify KBOB!" each
	    uploaded file is classified three times (discipline, document type, floor),
	    the answers are shown in three columns, written to
	    'ai_generated_metadata.txt' and offered as a download.
	    """
	    st.title("Doc Classifier")
	    if st.toggle("show README"):
	        st.subheader("Funktion: ")
	        st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren. lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren. Feedback und Bugs gerne an elia.waefler@insel.ch")
	        st.write("Vielen Dank.")
	        st.write("")
	        st.subheader("Licence and credits")
	        st.write("THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.")
	        st.write("special thanks to OpenAI, Huggingface, Streamlit")
	        l, r = st.columns(2)
	        with l:
	            st.subheader("Limitationen: ")
	            st.write("bisher nur PDFs")
	            st.write("nur Disziplin, Doc typ. und Geschoss")
	            st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
	            st.write("")
	        with r:
	            st.subheader("geplante Erweiterungen:")
	            st.write("Text Beschreibung wird von AI hinzugefügt")
	            st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
	            st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")
	    if "login" not in st.session_state:
	        st.session_state.login = False

	    if not st.session_state.login:
	        user_pw = st.text_input("ASK_ASH_PASSWORD: ", type="password")
	        if st.button("check"):
	            time.sleep(0.5)
	            if user_pw == ASK_ASH_PASSWORD:
	                st.session_state.login = True
	                st.rerun()
	        return

	    uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
	    if not st.button("classify KBOB!"):
	        return

	    # With accept_multiple_files=True the uploader yields an empty list (not
	    # None) when nothing was uploaded, so test for emptiness.
	    if not uploaded_files:
	        st.warning("no file")
	        return

	    with st.container():
	        col1, col2, col3 = st.columns(3)
	        # One entry per classification: target column, heading, prompt fragment
	        # and the allowed categories.
	        tasks = (
	            (col1, "Disziplin", auftrag_1_disziplin, Baubranchen_Disziplinen),
	            (col2, "Dokumententyp", auftrag_1_type, Dokumententypen),
	            (col3, "Geschoss", auftrag_1_ge, ASH_Geschosse),
	        )
	        for column, heading, _instruction, _categories in tasks:
	            with column:
	                st.write(heading)
	                st.write("")

	        all_metadata = []
	        for file in uploaded_files:
	            metadata = [str(file.name)]
	            with col1, st.spinner("GPT4 at work"):
	                pdf_text = str(get_pdf_text(file))
	            for column, _heading, instruction, categories in tasks:
	                with column:
	                    with st.spinner("GPT4 at work"):
	                        answer = _classify(instruction, categories, pdf_text)
	                    metadata.append(str(answer))
	                    st.write(answer)
	            all_metadata.append(metadata)

	        metadata_filename = "ai_generated_metadata.txt"
	        metadata_text = _format_metadata(all_metadata)
	        with open(metadata_filename, 'w', encoding='utf-8') as f:
	            f.write(metadata_text)

	        st.success("classified, saved")
	        # Serve the text built in memory rather than re-reading the file: the
	        # file name is fixed, so another session could have overwritten it.
	        st.download_button("Download Metadata", metadata_text, file_name=metadata_filename)


	# Prompt fragments and category lists that main() reads as module globals.
	# They live at module level (not under the __main__ guard) so that main() also
	# works when this module is imported instead of being run as a script.
	auftrag_0 = "Klassifiziere dieses Dokument nach "
	auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
	auftrag_1_type = "diesen 'Dokumententypen': "
	auftrag_1_ge = "diesen 'Geschossen': "
	Baubranchen_Disziplinen = [
	    'A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
	    'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
	    'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
	    'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
	    'Z-Lichtplanung',
	]
	# NOTE(review): auftrag_2 is defined but not referenced by main() in this file;
	# confirm whether it is meant to be part of the prompts.
	auftrag_2 = (
	    "gib nur den am besten passendsten Eintrag zurück. "
	    "Keine weiteren Ausführungen oder Erklärungen. "
	    "Antworte am besten in einem Wort. "
	    "Hier der Dokumenteninhalt: "
	)
	Dokumententypen = [
	    'Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
	    'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation',
	]
	ASH_Geschosse = [
	    'U4', 'U3', 'U2', 'U1',
	    'A', 'B', 'C', 'D', 'E', 'F', 'G',
	]


	if __name__ == "__main__":
	    main()