File size: 5,060 Bytes
33d569a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import streamlit as st
import os
import pandas as pd
from PyPDF2 import PdfReader
import openai
from collections import defaultdict
from io import StringIO
from pdfminer.high_level import extract_text  
import json
from openai import OpenAI
import re

# 1. Initialization
st.title("Mahkamah Agung: NER & Summarization of Legal Documents")

# The OpenAI key is read from the environment instead of being hard-coded:
# a secret committed to source control is readable by anyone with access to
# the repository. NOTE: any key that was previously committed in this file
# must be treated as compromised and rotated.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    # Without a key the OpenAI client cannot be built; stop this script run
    # with a readable message rather than a traceback.
    st.error("OPENAI_API_KEY is not set. Export it in the environment before starting the app.")
    st.stop()
openai.api_key = api_key
client = OpenAI(api_key=api_key)

# Folder (relative to the working directory) that holds the PDFs to analyse.
pdf_folder = "pdf"



#---------------------PDF OVERVIEW----------------------
# Section heading for the overview of the PDFs found in `pdf_folder`.
st.subheader("PDF Folder Overview")
def get_pdf_details(folder_path):
    """Collect the filename and page count of every PDF directly in a folder.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).

    Returns:
        A list of ``{"Filename": ..., "Page Count": ...}`` dicts, one per PDF
        that could be opened, ordered by filename. The list is empty when the
        folder does not exist or contains no readable PDFs.
    """
    # Treat a missing folder like an empty one so the page still renders
    # instead of crashing with FileNotFoundError from os.listdir().
    if not os.path.isdir(folder_path):
        return []

    pdf_details = []
    # Sorted so the table order is stable; os.listdir() order is arbitrary.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        try:
            with open(pdf_path, "rb") as file:
                page_count = len(PdfReader(file).pages)
        except Exception as e:
            # Best-effort listing: one unreadable or corrupt file is reported
            # in the UI and skipped, it must not hide the other PDFs.
            st.warning(f"Could not read {filename}: {str(e)}")
            continue
        pdf_details.append({"Filename": filename, "Page Count": page_count})
    return pdf_details
# Scan the folder once, then either show the result as a collapsible table
# or tell the user that there is nothing to list.
pdf_list = get_pdf_details(pdf_folder)
pdf_df = pd.DataFrame(pdf_list)
if pdf_df.empty:
    st.warning("No PDFs found in the specified folder.")
else:
    with st.expander('PDF Overview'):
        st.dataframe(pdf_df)



#---------------------PDF SEARCH AND EXTRACT----------------------
# Section heading for the PDF search, selection and text-extraction part of the page.
st.subheader("PDF to Text Conversion")

# Function to read and extract text from a PDF using PdfReader
def extract_text_from_pdf_pypdf2(pdf_path):
    """Return the text of all pages of the PDF at *pdf_path*, concatenated.

    Pages for which extraction yields nothing are skipped; no separator is
    inserted between pages. Errors from opening or parsing the file
    propagate to the caller.
    """
    with open(pdf_path, "rb") as handle:
        reader = PdfReader(handle)
        # Extract each page exactly once, then keep only the non-empty results.
        page_texts = (page.extract_text() for page in reader.pages)
        return "".join(chunk for chunk in page_texts if chunk)




# Function to read and extract text from a PDF using pdfminer
def extract_text_from_pdf_pdfminer(pdf_path):
    """Return the text pdfminer extracts from the PDF at *pdf_path*.

    Any extraction failure is shown in the Streamlit UI and an empty string
    is returned instead of raising.
    """
    try:
        return extract_text(pdf_path)
    except Exception as e:
        st.error(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

# List the PDFs available for analysis. A missing folder is treated as empty
# so the page does not crash with FileNotFoundError from os.listdir().
if os.path.isdir(pdf_folder):
    pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))
else:
    pdf_files = []

search_query = st.text_input("Search for a PDF")
# Case-insensitive substring match; an empty query matches every file.
query_lower = search_query.lower()
filtered_pdfs = [pdf for pdf in pdf_files if query_lower in pdf.lower()]

if filtered_pdfs:
    selected_pdf = st.selectbox("Select a PDF to convert to text", filtered_pdfs)
else:
    # Bind the name and halt this script run: the analysis step further down
    # reads `selected_pdf`, which would otherwise be undefined (NameError)
    # when nothing matches the search.
    selected_pdf = None
    st.warning("No PDFs found matching your search.")
    st.stop()

# Runs the full pipeline for the selected PDF when the button is pressed:
# extract text -> ask the model for a structured JSON summary -> show it as a table.
if st.button("analyze The Document"):
    # `selected_pdf` is only assigned when the search produced matches, so
    # check the match list first instead of hitting a NameError.
    if not filtered_pdfs:
        st.warning("Select a PDF before analyzing.")
        st.stop()

    pdf_path = os.path.join(pdf_folder, selected_pdf)
    extracted_text = extract_text_from_pdf_pdfminer(pdf_path)

    # Nothing usable was extracted (whitespace/form feeds only counts as
    # nothing): do not send an empty document to the model, it would only
    # produce an invented summary.
    if not extracted_text or not extracted_text.strip():
        st.warning("No text extracted. The PDF might contain images or other non-text content.")
        st.stop()

    # Display the extracted text
    with st.expander('Extracted Text'):
        st.text_area("Extracted Text", value=extracted_text, height=300)

    # Earlier prompt variant (disabled): same instruction as the prompt below,
    # but it requested the fields under English names: presiding judge,
    # member judge, clerk, ruling, other rulings, note of ruling, date of
    # deliberation, date read out, type of judicial institution, date of
    # register, judicial institution, case_number, court, defendants.name,
    # defendants.place_of_birth, defendants.date_of_birth, defendants.age,
    # defendants.gender, defendants.nationality, defendants.religion,
    # defendants.occupation, charges.article, charges.offense,
    # verdict.sentence, verdict.assets_confiscated.description,
    # verdict.assets_confiscated.weight, fine and conclusion.

    # Active prompt (Indonesian): asks the model to act as a justice of the
    # Indonesian Supreme Court, summarise the ruling inserted at `{}` and
    # return the listed fields. The text is sent verbatim to the model.
    template = """
        
        # Anda Adalah Seorang Hakim Agung Di Mahkamah Agung Di Indonesia. Berdasarkan Putusan Di Bawah Ini, Berikan Kesimpulannya:
        {}
        Variabel Yang Harus Ada Adalah Sebagai Berikut: Hakim Ketua, Hakim Anggota, Panitera, Putusan, Putusan Lainnya, Catatan Putusan, Tanggal Musyawarah, Tanggal Pembacaan, Jenis Institusi Yudisial, Tanggal Pendaftaran, Institusi Yudisial, Nomor Kasus, Pengadilan, Terdakwa.Nama, Terdakwa.Tempat_Lahir, Terdakwa.Tanggal_Lahir, Terdakwa.Usia, Terdakwa.Jenis_Kelamin, Terdakwa.Kebangsaan, Terdakwa.Agama, Terdakwa.Pekerjaan, Pasal_Dakwaan, Pelanggaran_Dakwaan, Vonis.Hukuman, Vonis.Atribut_Disita.Deskripsi, Vonis.Atribut_Disita.Berat, Denda, Dan Kesimpulan.
    # """

    #---------------------NER & SUMMARIZATION----------------------
    # Network / quota / context-length failures are reported in the UI
    # instead of surfacing as a raw traceback.
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
                {"role": "user", "content": template.format(extracted_text)},
            ],
        )
    except openai.OpenAIError as e:
        st.error(f"OpenAI request failed: {e}")
        st.stop()

    # The message content can be missing or truncated (e.g. when the reply is
    # cut off), in which case it is not valid JSON.
    raw_content = response.choices[0].message.content
    try:
        data = json.loads(raw_content)
    except (TypeError, json.JSONDecodeError) as e:
        st.error(f"The model did not return valid JSON: {e}")
        st.stop()

    # Flatten nested objects into dotted column names, then transpose so each
    # field becomes a row under a single value column.
    df = pd.json_normalize(data)
    df = df.T
    df.columns = ["Kesimpulan Putusan"]
    st.dataframe(df)