# Spaces:
# Runtime error
# Runtime error
import streamlit as st | |
import os | |
import pandas as pd | |
from PyPDF2 import PdfReader | |
import openai | |
from collections import defaultdict | |
from io import StringIO | |
from pdfminer.high_level import extract_text | |
import json | |
from openai import OpenAI | |
import re | |
# 1. Initialization
# The OpenAI key is a secret, so it is read from the environment instead of
# being written into the source file. Any key that was ever committed in this
# file must be treated as leaked and revoked.
api_key = os.environ.get("OPENAI_API_KEY")
pdf_folder = "pdf"
st.title("Mahkamah Agung: NER & Summarization of Legal Documents")
if not api_key:
    # Without a key every later API call would fail; halt this script run
    # with an actionable message instead.
    st.error(
        "OPENAI_API_KEY is not set. Define it as an environment variable "
        "(or Space secret) and reload the app."
    )
    st.stop()
openai.api_key = api_key
client = OpenAI(api_key=api_key)
#---------------------PDF OVERVIEW----------------------
st.subheader("PDF Folder Overview")
def get_pdf_details(folder_path):
    """Collect the filename and page count of every PDF directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of ``{"Filename": ..., "Page Count": ...}`` dicts ordered by
        filename. The list is empty when the folder does not exist, is not a
        directory, or contains no readable PDF.
    """
    try:
        # Sorted so the overview is stable; os.listdir order is arbitrary.
        filenames = sorted(os.listdir(folder_path))
    except (FileNotFoundError, NotADirectoryError):
        # A missing folder is an empty result, not a crash: the caller can
        # then show its own "no PDFs" message.
        return []

    pdf_details = []
    for filename in filenames:
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        try:
            with open(pdf_path, "rb") as file:
                page_count = len(PdfReader(file).pages)
        except Exception as e:
            # Deliberately broad: one corrupt or encrypted file must not
            # prevent the remaining files from being listed.
            st.warning(f"Could not read {filename}: {str(e)}")
        else:
            pdf_details.append({"Filename": filename, "Page Count": page_count})
    return pdf_details
# Render the folder overview: a table when at least one PDF was listed,
# otherwise a warning.
pdf_list = get_pdf_details(pdf_folder)
pdf_df = pd.DataFrame(pdf_list)
if pdf_df.empty:
    st.warning("No PDFs found in the specified folder.")
else:
    with st.expander('PDF Overview'):
        st.dataframe(pdf_df)
#---------------------PDF SEARCH AND EXTRACT----------------------
st.subheader("PDF to Text Conversion")
# Function to read and extract text from a PDF using PdfReader
def extract_text_from_pdf_pypdf2(pdf_path):
    """Return the text of all pages of the PDF at ``pdf_path``, concatenated.

    Pages for which extraction yields nothing (``None`` or an empty string)
    contribute nothing to the result.
    """
    with open(pdf_path, "rb") as handle:
        reader = PdfReader(handle)
        # Consume the generator while the file is still open.
        page_texts = (page.extract_text() for page in reader.pages)
        return "".join(chunk for chunk in page_texts if chunk)
# Function to read and extract text from a PDF using pdfminer
def extract_text_from_pdf_pdfminer(pdf_path):
    """Return the text pdfminer extracts from ``pdf_path``.

    Any exception raised during extraction is shown in the UI via
    ``st.error`` and an empty string is returned instead.
    """
    try:
        return extract_text(pdf_path)
    except Exception as e:
        st.error(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""
# List the candidate PDFs. os.listdir raises when the folder is missing, so
# fall back to an empty list and let the search warning below cover it.
pdf_files = (
    sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))
    if os.path.isdir(pdf_folder)
    else []
)
search_query = st.text_input("Search for a PDF")
filtered_pdfs = [pdf for pdf in pdf_files if search_query.lower() in pdf.lower()]

# Always bind selected_pdf so the analyze step can never hit a NameError when
# the search matches nothing.
selected_pdf = None
if filtered_pdfs:
    selected_pdf = st.selectbox("Select a PDF to convert to text", filtered_pdfs)
else:
    st.warning("No PDFs found matching your search.")

# Prompt sent to the model (Indonesian): it asks for a conclusion of the
# ruling plus a fixed list of fields. The "{}" placeholder receives the
# extracted document text. An earlier variant of this prompt listed the same
# fields under English names.
template = """
# Anda Adalah Seorang Hakim Agung Di Mahkamah Agung Di Indonesia. Berdasarkan Putusan Di Bawah Ini, Berikan Kesimpulannya:
{}
Variabel Yang Harus Ada Adalah Sebagai Berikut: Hakim Ketua, Hakim Anggota, Panitera, Putusan, Putusan Lainnya, Catatan Putusan, Tanggal Musyawarah, Tanggal Pembacaan, Jenis Institusi Yudisial, Tanggal Pendaftaran, Institusi Yudisial, Nomor Kasus, Pengadilan, Terdakwa.Nama, Terdakwa.Tempat_Lahir, Terdakwa.Tanggal_Lahir, Terdakwa.Usia, Terdakwa.Jenis_Kelamin, Terdakwa.Kebangsaan, Terdakwa.Agama, Terdakwa.Pekerjaan, Pasal_Dakwaan, Pelanggaran_Dakwaan, Vonis.Hukuman, Vonis.Atribut_Disita.Deskripsi, Vonis.Atribut_Disita.Berat, Denda, Dan Kesimpulan.
# """

# The button is only offered once a document is selected.
if selected_pdf is not None and st.button("analyze The Document"):
    pdf_path = os.path.join(pdf_folder, selected_pdf)
    extracted_text = extract_text_from_pdf_pdfminer(pdf_path)
    # Whitespace-only output counts as "nothing extracted" so that no empty
    # prompt is sent to the API.
    if not extracted_text.strip():
        st.warning("No text extracted. The PDF might contain images or other non-text content.")
    else:
        # Display the extracted text
        with st.expander('Extracted Text'):
            st.text_area("Extracted Text", value=extracted_text, height=300)
        #---------------------NER & SUMMARIZATION----------------------
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo-0125",
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
                    {"role": "user", "content": template.format(extracted_text)},
                ],
            )
            # content may be None, or truncated JSON when the reply hits the
            # token limit; both cases are handled below.
            data = json.loads(response.choices[0].message.content)
        except openai.OpenAIError as e:
            st.error(f"OpenAI request failed: {e}")
        except (json.JSONDecodeError, TypeError) as e:
            st.error(f"The model did not return valid JSON: {e}")
        else:
            # Flatten nested keys into dotted names, then transpose so each
            # field becomes a row under a single value column.
            df = pd.json_normalize(data)
            df = df.T
            df.columns = ["Kesimpulan Putusan"]
            st.dataframe(df)