Ari committed
Commit deb55dd
1 Parent(s): c0d316e

Update app.py

Files changed (1)
app.py +37 -27
app.py CHANGED
@@ -9,28 +9,40 @@ from docx import Document
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
 
-# Switch to a more lightweight model like DistilBART for faster processing
-tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
-model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
+# Use LegalBERT for handling legal documents
+tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
+model = AutoModelForSeq2SeqLM.from_pretrained("nlpaueb/legal-bert-base-uncased")
 
-# Function to chunk text based on token length
+# Function to chunk the text into manageable pieces
 def chunk_text(text, max_token_len=512):
-    tokens = tokenizer(text, return_tensors="pt", truncation=False, padding=False)["input_ids"].squeeze()
-    total_length = len(tokens)
-    # Split text into manageable token chunks
-    chunks = [tokens[i:i+max_token_len] for i in range(0, total_length, max_token_len)]
+    sentences = re.split(r'(?<=[.!?]) +', text)
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for sentence in sentences:
+        tokens = tokenizer.tokenize(sentence)
+        if current_length + len(tokens) <= max_token_len:
+            current_chunk.append(sentence)
+            current_length += len(tokens)
+        else:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = [sentence]
+            current_length = len(tokens)
+
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+
     return chunks
 
 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     doc = Document(docx_file)
-    full_text = []
-    for para in doc.paragraphs:
-        full_text.append(para.text)
+    full_text = [para.text for para in doc.paragraphs]
 
     pdf = canvas.Canvas(output_pdf, pagesize=letter)
     pdf.setFont("Helvetica", 12)
-
     text = pdf.beginText(40, 750)
+
     for line in full_text:
         text.textLine(line)
 
@@ -38,13 +50,13 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     pdf.save()
     return output_pdf
 
-# Summarize each chunk of tokens
+# Summarize each chunk and then recursively summarize the summaries
 def summarize_chunk(chunk, min_length=50, max_length=150):
-    inputs = {"input_ids": chunk.unsqueeze(0)}  # Add batch dimension
-    summary_ids = model.generate(inputs["input_ids"], num_beams=1, min_length=min_length, max_length=max_length)
+    inputs = tokenizer([chunk], max_length=512, truncation=True, return_tensors="pt")
+    summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=max_length)
     return tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
 
-# Main processing function
+# Main processing function using recursive summarization
 def pdf_to_text(text, PDF, min_length=50):
     try:
         file_extension = os.path.splitext(PDF.name)[1].lower()
@@ -55,33 +67,31 @@ def pdf_to_text(text, PDF, min_length=50):
         elif file_extension == '.pdf' and text == "":
             text = extract_text(PDF.name)
 
-        # Split text into token-based chunks
         chunks = chunk_text(text)
-        summarized_text = ""
-
-        # Summarize each chunk
-        for chunk in chunks:
-            summarized_text += summarize_chunk(chunk, min_length=min_length, max_length=min_length + 100) + "\n\n"
-
+        summarized_chunks = [summarize_chunk(chunk, min_length=min_length) for chunk in chunks]
+
+        # Combine summaries and recursively summarize the combined text
+        summarized_text = " ".join(summarized_chunks)
+        final_summary = summarize_chunk(summarized_text, min_length=min_length, max_length=min_length + 150)
+
         # Save summarized text to PDF
         pdf = FPDF()
         pdf.add_page()
         pdf.set_font("Times", size=12)
-        pdf.multi_cell(190, 10, txt=summarized_text, align='C')
+        pdf.multi_cell(190, 10, txt=final_summary, align='C')
         pdf_output_path = "legal.pdf"
         pdf.output(pdf_output_path)
 
         # Convert summarized text to audio
         audio_output_path = "legal.wav"
-        tts = gTTS(text=summarized_text, lang='en', slow=False)
+        tts = gTTS(text=final_summary, lang='en', slow=False)
         tts.save(audio_output_path)
 
-        return audio_output_path, summarized_text, pdf_output_path
+        return audio_output_path, final_summary, pdf_output_path
 
     except Exception as e:
         return None, f"An error occurred: {str(e)}", None
 
-# Preloaded document processor
 def process_sample_document(min_length=50):
     sample_document_path = "Marbury v. Madison.pdf"
 
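A note on the model swap above (reviewer sketch, not part of the commit): nlpaueb/legal-bert-base-uncased is an encoder-only BERT checkpoint, so AutoModelForSeq2SeqLM.from_pretrained() cannot load it (BERT has no decoder, and model.generate() needs one); as written, the new revision should fail at startup. One possible fix is to keep LegalBERT only for token counting in chunk_text while generating with a real seq2seq summarizer, such as the DistilBART checkpoint this commit removes. The split-tokenizer variable name below is illustrative, not from the commit:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Encoder-only LegalBERT tokenizer: used only for counting tokens in chunk_text.
chunk_tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

# generate() requires a seq2seq model; DistilBART is what the previous revision used.
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")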
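Two details of the new chunk_text are worth flagging: it relies on the re module, whose import sits above this hunk's visible context (worth confirming it exists), and when the very first sentence is already longer than max_token_len, the else branch appends an empty string to chunks before starting the oversized one. A hedged variant of the packing loop that flushes only when there is something to flush, assuming the module-level tokenizer from app.py:

def chunk_text(text, max_token_len=512):
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks, current_chunk, current_length = [], [], 0
    for sentence in sentences:
        n_tokens = len(tokenizer.tokenize(sentence))
        # Flush before overflowing, but never emit an empty chunk.
        if current_chunk and current_length + n_tokens > max_token_len:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_length = [], 0
        current_chunk.append(sentence)
        current_length += n_tokens
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks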
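Finally, the reduce step in pdf_to_text is single-pass: summarize_chunk truncates its input at 512 tokens, so when the concatenated chunk summaries run past that window, the tail is silently dropped. A minimal sketch of a genuinely recursive reduce, assuming chunk_text, summarize_chunk, and tokenizer as defined in app.py:

def recursive_summary(text, min_length=50):
    # Map: summarize each chunk of the document independently.
    partials = [summarize_chunk(c, min_length=min_length) for c in chunk_text(text)]
    combined = " ".join(partials)
    # Reduce: if the combined summaries still exceed the 512-token window,
    # recurse; each pass shortens the text, so this terminates.
    if len(partials) > 1 and len(tokenizer.tokenize(combined)) > 512:
        return recursive_summary(combined, min_length=min_length)
    return summarize_chunk(combined, min_length=min_length, max_length=min_length + 150)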