destiratnakomala commited on
Commit
33d569a
1 Parent(s): fdbdac0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -0
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import pandas as pd
4
+ from PyPDF2 import PdfReader
5
+ import openai
6
+ from collections import defaultdict
7
+ from io import StringIO
8
+ from pdfminer.high_level import extract_text
9
+ import json
10
+ from openai import OpenAI
11
+ import re
12
+
13
# 1. Initialization
# The OpenAI key is read from the environment so that no secret is stored in
# the repository.
# SECURITY: a literal API key used to be committed on this line; treat that
# key as compromised and revoke it in the OpenAI dashboard.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    # Fail early with a readable message instead of an authentication error
    # in the middle of an analysis run.
    st.error(
        "OPENAI_API_KEY is not set. Provide it as an environment variable "
        "(or as a secret of the hosting platform) and reload the app."
    )
    st.stop()
openai.api_key = api_key  # legacy module-level setting, kept from the original
client = OpenAI(api_key=api_key)
pdf_folder = "pdf"  # relative path of the folder that is scanned for PDFs

st.title("Mahkamah Agung: NER & Summarization of Legal Documents")


#---------------------PDF OVERVIEW----------------------
st.subheader("PDF Folder Overview")
25
def get_pdf_details(folder_path):
    """Collect the filename and page count of every PDF in *folder_path*.

    Args:
        folder_path: Directory to scan; sub-directories are not searched.

    Returns:
        A list of ``{"Filename": ..., "Page Count": ...}`` dicts, one per
        PDF that could be opened, ordered by filename.  The list is empty
        when the folder does not exist or contains no PDF.
    """
    # A missing folder is treated like an empty one, so the caller gets an
    # empty list instead of a FileNotFoundError from os.listdir.
    if not os.path.isdir(folder_path):
        return []

    pdf_details = []
    # os.listdir returns entries in arbitrary order; sort for a stable table.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        try:
            with open(pdf_path, "rb") as file:
                page_count = len(PdfReader(file).pages)
        except Exception as e:
            # Best effort: one unreadable file must not hide the others.
            st.warning(f"Could not read {filename}: {str(e)}")
        else:
            pdf_details.append({"Filename": filename, "Page Count": page_count})
    return pdf_details
38
# Build the overview table once and show it in a collapsible section;
# warn instead when the resulting table has no rows.
pdf_list = get_pdf_details(pdf_folder)
pdf_df = pd.DataFrame(pdf_list)
if pdf_df.empty:
    st.warning("No PDFs found in the specified folder.")
else:
    with st.expander("PDF Overview"):
        st.dataframe(pdf_df)


#---------------------PDF SEARCH AND EXTRACT----------------------
st.subheader("PDF to Text Conversion")
50
+
51
# Extract the text of a PDF with PyPDF2's PdfReader.
def extract_text_from_pdf_pypdf2(pdf_path):
    """Return the text of all pages of *pdf_path*, concatenated in page order.

    Pages whose ``extract_text()`` result is empty or ``None`` are skipped,
    and no separator is inserted between pages.
    """
    with open(pdf_path, "rb") as handle:
        reader = PdfReader(handle)
        # Extraction must happen while the file is still open.
        chunks = [page.extract_text() for page in reader.pages]
    return "".join(chunk for chunk in chunks if chunk)
61
+
62
+
63
+
64
+
65
# Extract the text of a PDF with pdfminer.six.
def extract_text_from_pdf_pdfminer(pdf_path):
    """Return the text that pdfminer's ``extract_text`` yields for *pdf_path*.

    If extraction raises, the error is reported through ``st.error`` and an
    empty string is returned instead.
    """
    try:
        return extract_text(pdf_path)
    except Exception as err:
        st.error(f"Error extracting text from {pdf_path}: {str(err)}")
        return ""
74
+
75
# List the PDFs in the folder and let the user narrow them down by name.
# A missing folder is treated as "no PDFs" instead of crashing os.listdir.
if os.path.isdir(pdf_folder):
    pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')]
else:
    pdf_files = []
search_query = st.text_input("Search for a PDF")
filtered_pdfs = [pdf for pdf in pdf_files if search_query.lower() in pdf.lower()]

# selected_pdf stays None when nothing matches, so the button handler below
# can detect that case instead of hitting an unbound name.
selected_pdf = None
if filtered_pdfs:
    selected_pdf = st.selectbox("Select a PDF to convert to text", filtered_pdfs)
else:
    st.warning("No PDFs found matching your search.")

# Prompt sent to the model (Indonesian): asks for a conclusion of the ruling
# and lists the fields the answer must contain.  The single "{}" placeholder
# receives the extracted document text.
template = """

# Anda Adalah Seorang Hakim Agung Di Mahkamah Agung Di Indonesia. Berdasarkan Putusan Di Bawah Ini, Berikan Kesimpulannya:
{}
Variabel Yang Harus Ada Adalah Sebagai Berikut: Hakim Ketua, Hakim Anggota, Panitera, Putusan, Putusan Lainnya, Catatan Putusan, Tanggal Musyawarah, Tanggal Pembacaan, Jenis Institusi Yudisial, Tanggal Pendaftaran, Institusi Yudisial, Nomor Kasus, Pengadilan, Terdakwa.Nama, Terdakwa.Tempat_Lahir, Terdakwa.Tanggal_Lahir, Terdakwa.Usia, Terdakwa.Jenis_Kelamin, Terdakwa.Kebangsaan, Terdakwa.Agama, Terdakwa.Pekerjaan, Pasal_Dakwaan, Pelanggaran_Dakwaan, Vonis.Hukuman, Vonis.Atribut_Disita.Deskripsi, Vonis.Atribut_Disita.Berat, Denda, Dan Kesimpulan.
# """

# Everything below depends on the text extracted for the selected document,
# so it runs only after the button has been pressed.
if st.button("analyze The Document"):
    if selected_pdf is None:
        st.error("No PDF is selected. Adjust the search and pick a document first.")
        st.stop()

    pdf_path = os.path.join(pdf_folder, selected_pdf)
    extracted_text = extract_text_from_pdf_pdfminer(pdf_path)

    # Display the extracted text; without text there is nothing to analyse.
    if not extracted_text:
        st.warning("No text extracted. The PDF might contain images or other non-text content.")
        st.stop()
    with st.expander('Extracted Text'):
        st.text_area("Extracted Text", value=extracted_text, height=300)

    #---------------------NER & SUMMARIZATION----------------------
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
                {"role": "user", "content": template.format(extracted_text)},
            ],
        )
        # TypeError covers a response whose message content is None.
        data = json.loads(response.choices[0].message.content)
    except (openai.OpenAIError, json.JSONDecodeError, TypeError) as e:
        st.error(f"The analysis request failed: {str(e)}")
        st.stop()

    # Only a non-empty JSON object flattens to exactly one row, which is what
    # the single-column layout below requires.
    if not isinstance(data, dict) or not data:
        st.warning("The model returned no usable fields for this document.")
        st.stop()

    # One flattened row, transposed so every field becomes a table row.
    df = pd.json_normalize(data).T
    df.columns = ["Kesimpulan Putusan"]
    st.dataframe(df)
133
+