arithescientist committed on
Commit
c3c2470
·
verified ·
1 Parent(s): fc2a37d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -23
app.py CHANGED
@@ -8,27 +8,34 @@ from pdfminer.high_level import extract_text
8
  from docx import Document
9
  from reportlab.lib.pagesizes import letter
10
  from reportlab.pdfgen import canvas
 
 
11
  import spacy
12
 
13
- # Load spaCy English model
14
- nlp = spacy.load("en_core_web_sm")
 
 
 
 
 
15
 
16
- # Load the LegalBERT model and tokenizer
17
- tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
18
  model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")
19
 
20
  # Convert DOCX to PDF using ReportLab
21
  def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
22
  doc = Document(docx_file)
23
  full_text = [para.text for para in doc.paragraphs]
24
-
25
  pdf = canvas.Canvas(output_pdf, pagesize=letter)
26
  pdf.setFont("Helvetica", 12)
27
-
28
  text_object = pdf.beginText(40, 750)
29
  for line in full_text:
30
  text_object.textLine(line)
31
-
32
  pdf.drawText(text_object)
33
  pdf.save()
34
  return output_pdf
@@ -73,9 +80,9 @@ def pdf_to_text(text, PDF, num_sentences=5):
73
  pass # Use the text input provided by the user
74
  else:
75
  return None, "Please provide input text or upload a file.", None
76
-
77
  summary = extractive_summarization(text, num_sentences)
78
-
79
  # Generate a PDF of the summary
80
  pdf = FPDF()
81
  pdf.add_page()
@@ -83,14 +90,14 @@ def pdf_to_text(text, PDF, num_sentences=5):
83
  pdf.multi_cell(190, 10, txt=summary, align='L')
84
  pdf_output_path = "legal_summary.pdf"
85
  pdf.output(pdf_output_path)
86
-
87
  # Generate an audio file of the summary
88
  audio_output_path = "legal_summary.wav"
89
  tts = gTTS(text=summary, lang='en', slow=False)
90
  tts.save(audio_output_path)
91
-
92
  return audio_output_path, summary, pdf_output_path
93
-
94
  except Exception as e:
95
  return None, f"An error occurred: {str(e)}", None
96
 
@@ -104,35 +111,35 @@ def process_sample_document(num_sentences=5):
104
  # Gradio UI: a sample-document button, a text box, a file upload and a
  # sentence-count slider; every handler writes to the same three outputs
  # (audio, summary text, summary PDF).
  # NOTE(review): indentation is lost in this view, so which components sit
  # inside gr.Row() cannot be determined here - confirm against app.py.
  with gr.Blocks() as iface:
105
  with gr.Row():
106
  process_sample_button = gr.Button("Summarize Marbury v. Madison Case (Pre-Uploaded)")
107
-
108
  text_input = gr.Textbox(label="Input Text")
109
  file_input = gr.File(label="Upload PDF or DOCX")
110
  slider = gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Number of Summary Sentences")
111
-
112
  audio_output = gr.Audio(label="Generated Audio")
113
  summary_output = gr.Textbox(label="Generated Summary")
114
  pdf_output = gr.File(label="Summary PDF")
115
-
116
  # Update the function calls to match new parameters
  # The sample button passes only the slider value to process_sample_document.
117
  process_sample_button.click(
118
- fn=process_sample_document,
119
- inputs=slider,
120
  outputs=[audio_output, summary_output, pdf_output]
121
  )
122
  # Use submit event for the text input and file input
  # on_submit forwards the text, file and slider values to pdf_to_text unchanged.
123
  def on_submit(text, file, num_sentences):
124
  return pdf_to_text(text, file, num_sentences)
125
-
126
  # The textbox submit event and the file-upload change event both run
  # on_submit with the same inputs and outputs.
  text_input.submit(
127
- fn=on_submit,
128
- inputs=[text_input, file_input, slider],
129
  outputs=[audio_output, summary_output, pdf_output]
130
  )
131
  file_input.change(
132
- fn=on_submit,
133
- inputs=[text_input, file_input, slider],
134
  outputs=[audio_output, summary_output, pdf_output]
135
  )
136
-
137
  # Launch the interface only when the file is run as a script.
  if __name__ == "__main__":
138
  iface.launch()
 
8
  from docx import Document
9
  from reportlab.lib.pagesizes import letter
10
  from reportlab.pdfgen import canvas
11
+
12
+ # Import spaCy and handle model loading
13
  import spacy
14
 
15
+ try:
16
+ nlp = spacy.load("en_core_web_sm")
17
+ except OSError:
18
+ # Download the model if not found
19
+ from spacy.cli import download
20
+ download("en_core_web_sm")
21
+ nlp = spacy.load("en_core_web_sm")
22
 
23
+ # Load the LegalBERT model and tokenizer with use_fast=False
24
+ tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased", use_fast=False)
25
  model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")
26
 
27
  # Convert DOCX to PDF using ReportLab
28
  def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
29
  doc = Document(docx_file)
30
  full_text = [para.text for para in doc.paragraphs]
31
+
32
  pdf = canvas.Canvas(output_pdf, pagesize=letter)
33
  pdf.setFont("Helvetica", 12)
34
+
35
  text_object = pdf.beginText(40, 750)
36
  for line in full_text:
37
  text_object.textLine(line)
38
+
39
  pdf.drawText(text_object)
40
  pdf.save()
41
  return output_pdf
 
80
  pass # Use the text input provided by the user
81
  else:
82
  return None, "Please provide input text or upload a file.", None
83
+
84
  summary = extractive_summarization(text, num_sentences)
85
+
86
  # Generate a PDF of the summary
87
  pdf = FPDF()
88
  pdf.add_page()
 
90
  pdf.multi_cell(190, 10, txt=summary, align='L')
91
  pdf_output_path = "legal_summary.pdf"
92
  pdf.output(pdf_output_path)
93
+
94
  # Generate an audio file of the summary
95
  audio_output_path = "legal_summary.wav"
96
  tts = gTTS(text=summary, lang='en', slow=False)
97
  tts.save(audio_output_path)
98
+
99
  return audio_output_path, summary, pdf_output_path
100
+
101
  except Exception as e:
102
  return None, f"An error occurred: {str(e)}", None
103
 
 
111
  # Gradio UI: a sample-document button, a text box, a file upload and a
  # sentence-count slider; every handler writes to the same three outputs
  # (audio, summary text, summary PDF).
  # NOTE(review): indentation is lost in this view, so which components sit
  # inside gr.Row() cannot be determined here - confirm against app.py.
  with gr.Blocks() as iface:
112
  with gr.Row():
113
  process_sample_button = gr.Button("Summarize Marbury v. Madison Case (Pre-Uploaded)")
114
+
115
  text_input = gr.Textbox(label="Input Text")
116
  file_input = gr.File(label="Upload PDF or DOCX")
117
  slider = gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Number of Summary Sentences")
118
+
119
  audio_output = gr.Audio(label="Generated Audio")
120
  summary_output = gr.Textbox(label="Generated Summary")
121
  pdf_output = gr.File(label="Summary PDF")
122
+
123
  # Update the function calls to match new parameters
  # The sample button passes only the slider value to process_sample_document.
124
  process_sample_button.click(
125
+ fn=process_sample_document,
126
+ inputs=slider,
127
  outputs=[audio_output, summary_output, pdf_output]
128
  )
129
  # Use submit event for the text input and file input
  # on_submit forwards the text, file and slider values to pdf_to_text unchanged.
130
  def on_submit(text, file, num_sentences):
131
  return pdf_to_text(text, file, num_sentences)
132
+
133
  # The textbox submit event and the file-upload change event both run
  # on_submit with the same inputs and outputs.
  text_input.submit(
134
+ fn=on_submit,
135
+ inputs=[text_input, file_input, slider],
136
  outputs=[audio_output, summary_output, pdf_output]
137
  )
138
  file_input.change(
139
+ fn=on_submit,
140
+ inputs=[text_input, file_input, slider],
141
  outputs=[audio_output, summary_output, pdf_output]
142
  )
143
+
144
  # Launch the interface only when the file is run as a script.
  if __name__ == "__main__":
145
  iface.launch()