MA_check / app.py
destiratnakomala's picture
Create app.py
33d569a verified
raw
history blame
5.06 kB
import streamlit as st
import os
import pandas as pd
from PyPDF2 import PdfReader
import openai
from collections import defaultdict
from io import StringIO
from pdfminer.high_level import extract_text
import json
from openai import OpenAI
import re
# 1. Initialization
st.title("Mahkamah Agung: NER & Summarization of Legal Documents")

# The OpenAI key is a secret and must never be committed to the repository.
# It is read from the environment instead (e.g. a deployment secret exposed as
# an environment variable).
# NOTE(review): a literal key used to be hard-coded on this line; treat that
# key as compromised and revoke it.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    # Without a key every later model call would fail, so halt this script
    # run with an actionable message instead of a traceback.
    st.error(
        "OPENAI_API_KEY is not set. Configure it as an environment variable "
        "or deployment secret and reload the app."
    )
    st.stop()

openai.api_key = api_key
client = OpenAI(api_key=api_key)

# Folder (relative to the working directory) that holds the PDFs to browse.
pdf_folder = "pdf"

#---------------------PDF OVERVIEW----------------------
st.subheader("PDF Folder Overview")
def get_pdf_details(folder_path):
    """Collect the filename and page count of every PDF in a folder.

    Args:
        folder_path: Directory to scan (non-recursive). Entries are selected
            by a case-insensitive ``.pdf`` suffix.

    Returns:
        A list of ``{"Filename": ..., "Page Count": ...}`` dicts ordered by
        filename. The list is empty when the folder does not exist, is not a
        directory, or contains no readable PDF.
    """
    try:
        # Sorted so the overview table has a stable order across runs;
        # os.listdir makes no ordering guarantee.
        filenames = sorted(os.listdir(folder_path))
    except (FileNotFoundError, NotADirectoryError):
        # A missing folder is reported as "no PDFs" rather than crashing.
        return []

    pdf_details = []
    for filename in filenames:
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        try:
            with open(pdf_path, "rb") as file:
                page_count = len(PdfReader(file).pages)
        except Exception as e:
            # Best effort: one unreadable file must not hide the others, so
            # warn in the UI and move on to the next entry.
            st.warning(f"Could not read {filename}: {str(e)}")
            continue
        pdf_details.append({"Filename": filename, "Page Count": page_count})
    return pdf_details
# Scan the PDF folder once and tabulate the result for display.
pdf_list = get_pdf_details(pdf_folder)
pdf_df = pd.DataFrame(pdf_list)

if pdf_df.empty:
    # Nothing could be listed: tell the user instead of showing an empty table.
    st.warning("No PDFs found in the specified folder.")
else:
    # The table is tucked into an expander so it stays collapsed by default.
    with st.expander('PDF Overview'):
        st.dataframe(pdf_df)

#---------------------PDF SEARCH AND EXTRACT----------------------
st.subheader("PDF to Text Conversion")
# Read a PDF with PdfReader and return the text of all its pages.
def extract_text_from_pdf_pypdf2(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Pages whose extraction yields nothing (``None`` or an empty string) are
    skipped. Page texts are joined with no separator.
    """
    with open(pdf_path, "rb") as handle:
        reader = PdfReader(handle)
        # The generator is consumed by join() while the file is still open.
        page_texts = (page.extract_text() for page in reader.pages)
        return "".join(chunk for chunk in page_texts if chunk)
# Read a PDF with pdfminer and return its text.
def extract_text_from_pdf_pdfminer(pdf_path):
    """Return the text pdfminer's ``extract_text`` produces for *pdf_path*.

    Any exception raised during extraction is shown in the Streamlit UI via
    ``st.error`` and an empty string is returned instead of propagating it.
    """
    try:
        return extract_text(pdf_path)
    except Exception as err:
        st.error(f"Error extracting text from {pdf_path}: {str(err)}")
        return ""
# List the PDFs available for analysis. A missing folder is treated as
# "no PDFs" so the search UI still renders instead of raising.
try:
    pdf_files = sorted(
        f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')
    )
except (FileNotFoundError, NotADirectoryError):
    pdf_files = []

# Case-insensitive substring filter; an empty query matches every file.
search_query = st.text_input("Search for a PDF")
filtered_pdfs = [pdf for pdf in pdf_files if search_query.lower() in pdf.lower()]

# selected_pdf is bound on every path so that clicking the button when no
# PDF matches cannot raise NameError.
selected_pdf = None
if filtered_pdfs:
    selected_pdf = st.selectbox("Select a PDF to convert to text", filtered_pdfs)
else:
    st.warning("No PDFs found matching your search.")

# Prompt sent to the model (Indonesian). The single "{}" placeholder receives
# the extracted document text; the rest lists the fields the summary must have.
template = """
# Anda Adalah Seorang Hakim Agung Di Mahkamah Agung Di Indonesia. Berdasarkan Putusan Di Bawah Ini, Berikan Kesimpulannya:
{}
Variabel Yang Harus Ada Adalah Sebagai Berikut: Hakim Ketua, Hakim Anggota, Panitera, Putusan, Putusan Lainnya, Catatan Putusan, Tanggal Musyawarah, Tanggal Pembacaan, Jenis Institusi Yudisial, Tanggal Pendaftaran, Institusi Yudisial, Nomor Kasus, Pengadilan, Terdakwa.Nama, Terdakwa.Tempat_Lahir, Terdakwa.Tanggal_Lahir, Terdakwa.Usia, Terdakwa.Jenis_Kelamin, Terdakwa.Kebangsaan, Terdakwa.Agama, Terdakwa.Pekerjaan, Pasal_Dakwaan, Pelanggaran_Dakwaan, Vonis.Hukuman, Vonis.Atribut_Disita.Deskripsi, Vonis.Atribut_Disita.Berat, Denda, Dan Kesimpulan.
# """


#---------------------NER & SUMMARIZATION----------------------
def _summarize_ruling(document_text):
    """Request the structured summary of *document_text* from the chat model.

    Returns:
        A one-column DataFrame ("Kesimpulan Putusan") with one row per
        flattened field of the JSON object the model replied with.

    Raises:
        openai.OpenAIError: if the API request fails.
        json.JSONDecodeError: if the reply is not valid JSON.
        TypeError: if the reply carries no content to parse.
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
            {"role": "user", "content": template.format(document_text)},
        ],
    )
    data = json.loads(response.choices[0].message.content)
    # Flatten nested keys into columns, then transpose so each field is a row.
    summary = pd.json_normalize(data).T
    summary.columns = ["Kesimpulan Putusan"]
    return summary


if st.button("analyze The Document"):
    if selected_pdf is None:
        st.warning("Select a PDF before running the analysis.")
    else:
        pdf_path = os.path.join(pdf_folder, selected_pdf)
        extracted_text = extract_text_from_pdf_pdfminer(pdf_path)

        if not extracted_text:
            # Nothing to summarize, so the model call is skipped entirely.
            st.warning("No text extracted. The PDF might contain images or other non-text content.")
        else:
            # Display the extracted text
            with st.expander('Extracted Text'):
                st.text_area("Extracted Text", value=extracted_text, height=300)

            try:
                df = _summarize_ruling(extracted_text)
            except (openai.OpenAIError, json.JSONDecodeError, TypeError) as e:
                # Surface API / parsing failures in the UI instead of a traceback.
                st.error(f"Could not summarize {selected_pdf}: {str(e)}")
            else:
                st.dataframe(df)