"""Streamlit app: list a folder of PDFs, extract text, and summarize a ruling.

The page shows an overview of the PDFs found in ``pdf_folder``, lets the user
search for and select one, extracts its text with pdfminer, and sends that
text to an OpenAI chat model which is asked to reply with a JSON object.
The JSON is flattened and displayed as a one-column table.

The OpenAI key is read from the ``OPENAI_API_KEY`` environment variable.
"""
import json
import os
import re
from collections import defaultdict
from io import StringIO

import openai
import pandas as pd
import streamlit as st
from openai import OpenAI
from pdfminer.high_level import extract_text
from PyPDF2 import PdfReader

# 1. Initialization
# SECURITY: the key must never be hard-coded in the source file. Any key that
# was ever committed to this file should be revoked and replaced.
api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = api_key
# OpenAI() raises when no key is available, so only build the client when we
# have one; the rest of the page (overview, text extraction) still works.
client = OpenAI(api_key=api_key) if api_key else None

pdf_folder = "pdf"

# Prompt sent to the model. It is written in Indonesian; ``{}`` is replaced by
# the extracted ruling text, and the listed names are the fields the model is
# asked to include in its JSON reply.
template = """
#
Anda Adalah Seorang Hakim Agung Di Mahkamah Agung Di Indonesia.
Berdasarkan Putusan Di Bawah Ini, Berikan Kesimpulannya:
{}
Variabel Yang Harus Ada Adalah Sebagai Berikut: Hakim Ketua, Hakim Anggota, Panitera, Putusan, Putusan Lainnya, Catatan Putusan, Tanggal Musyawarah, Tanggal Pembacaan, Jenis Institusi Yudisial, Tanggal Pendaftaran, Institusi Yudisial, Nomor Kasus, Pengadilan, Terdakwa.Nama, Terdakwa.Tempat_Lahir, Terdakwa.Tanggal_Lahir, Terdakwa.Usia, Terdakwa.Jenis_Kelamin, Terdakwa.Kebangsaan, Terdakwa.Agama, Terdakwa.Pekerjaan, Pasal_Dakwaan, Pelanggaran_Dakwaan, Vonis.Hukuman, Vonis.Atribut_Disita.Deskripsi, Vonis.Atribut_Disita.Berat, Denda, Dan Kesimpulan.
#
"""


def list_pdf_files(folder_path):
    """Return the sorted names of the ``.pdf`` files directly in *folder_path*.

    A missing folder yields an empty list instead of raising, so callers can
    fall through to their normal "no PDFs found" handling.
    """
    if not os.path.isdir(folder_path):
        return []
    return sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
    )


def get_pdf_details(folder_path):
    """Return one ``{"Filename", "Page Count"}`` dict per readable PDF.

    Files that PyPDF2 cannot open are reported with ``st.warning`` and left
    out of the result (best effort: one bad file must not hide the others).
    """
    pdf_details = []
    for filename in list_pdf_files(folder_path):
        pdf_path = os.path.join(folder_path, filename)
        try:
            with open(pdf_path, "rb") as file:
                page_count = len(PdfReader(file).pages)
        except Exception as e:
            st.warning(f"Could not read {filename}: {str(e)}")
        else:
            pdf_details.append({"Filename": filename, "Page Count": page_count})
    return pdf_details


def extract_text_from_pdf_pypdf2(pdf_path):
    """Return the concatenated text of every page of *pdf_path* via PyPDF2.

    Pages for which PyPDF2 returns no text are skipped.
    """
    with open(pdf_path, "rb") as file:
        pages = PdfReader(file).pages
        return "".join(page.extract_text() or "" for page in pages)


def extract_text_from_pdf_pdfminer(pdf_path):
    """Return the text of *pdf_path* extracted with pdfminer.six.

    On any extraction error the error is shown with ``st.error`` and an empty
    string is returned.
    """
    try:
        return extract_text(pdf_path)
    except Exception as e:
        st.error(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""


def summarize_ruling(ruling_text):
    """Ask the chat model for a JSON summary of *ruling_text*; return a dict.

    Raises:
        openai.OpenAIError: the API request failed.
        ValueError: the reply was empty or not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant designed to output JSON.",
            },
            {"role": "user", "content": template.format(ruling_text)},
        ],
    )
    content = response.choices[0].message.content
    if not content:
        raise ValueError("The model returned an empty response.")
    return json.loads(content)


st.title("Mahkamah Agung: NER & Summarization of Legal Documents")

# ---------------------PDF OVERVIEW----------------------
st.subheader("PDF Folder Overview")

pdf_list = get_pdf_details(pdf_folder)
pdf_df = pd.DataFrame(pdf_list)

if not pdf_df.empty:
    with st.expander('PDF Overview'):
        st.dataframe(pdf_df)
else:
    st.warning("No PDFs found in the specified folder.")

# ---------------------PDF SEARCH AND EXTRACT----------------------
st.subheader("PDF to Text Conversion")

pdf_files = list_pdf_files(pdf_folder)
search_query = st.text_input("Search for a PDF")
filtered_pdfs = [pdf for pdf in pdf_files if search_query.lower() in pdf.lower()]

if filtered_pdfs:
    selected_pdf = st.selectbox("Select a PDF to convert to text", filtered_pdfs)
else:
    # Bind the name even when nothing matches so the button handler below
    # cannot hit a NameError.
    selected_pdf = None
    st.warning("No PDFs found matching your search.")

if st.button("analyze The Document") and selected_pdf is not None:
    pdf_path = os.path.join(pdf_folder, selected_pdf)
    extracted_text = extract_text_from_pdf_pdfminer(pdf_path)

    if not extracted_text:
        st.warning(
            "No text extracted. The PDF might contain images or other non-text content."
        )
    else:
        # Display the extracted text
        with st.expander('Extracted Text'):
            st.text_area("Extracted Text", value=extracted_text, height=300)

        # ---------------------NER & SUMMARIZATION----------------------
        # Only call the model when there is text to summarize and a key to
        # authenticate with.
        if client is None:
            st.error(
                "OPENAI_API_KEY is not set; cannot run NER & summarization."
            )
        else:
            try:
                data = summarize_ruling(extracted_text)
            except (openai.OpenAIError, ValueError) as e:
                st.error(f"Summarization failed: {e}")
            else:
                # One row per (flattened) JSON field, single value column.
                df = pd.json_normalize(data)
                df = df.T
                df.columns = ["Kesimpulan Putusan"]
                st.dataframe(df)