Ari committed
Commit f336636
1 Parent(s): 204d8e4

Update app.py

Files changed (1): app.py (+12 -31)
app.py CHANGED
@@ -9,18 +9,16 @@ from pdfminer.high_level import extract_text
 from docx import Document
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
+from concurrent.futures import ThreadPoolExecutor
 
-# Load the models and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
 
 # Function to chunk text into sentence-based chunks
 def chunk_text(text, max_token_len=1024):
-    # Split text into sentences
     sentences = [sent.strip() + '.' for sent in re.split(r'(?<!\d)\.\s', text) if len(sent) > 1]
     token_lengths = [len(tokenizer.tokenize(sent)) for sent in sentences]
 
-    # Initialize chunking
     chunk_size = max_token_len
     chunks = []
     current_chunk = []
@@ -40,46 +38,30 @@ def chunk_text(text, max_token_len=1024):
 
     return chunks
 
-def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
-    doc = Document(docx_file)
-    full_text = []
-    for para in doc.paragraphs:
-        full_text.append(para.text)
-
-    pdf = canvas.Canvas(output_pdf, pagesize=letter)
-    pdf.setFont("Helvetica", 12)
-
-    text = pdf.beginText(40, 750)
-    for line in full_text:
-        text.textLine(line)
-
-    pdf.drawText(text)
-    pdf.save()
-    return output_pdf
+# Summarization function
+def summarize_chunk(chunk, min_length=80):
+    inputs = tokenizer([chunk], max_length=1024, truncation=True, return_tensors="pt")
+    summary_ids = model.generate(inputs["input_ids"], num_beams=1, min_length=min_length, max_length=min_length + 300)
+    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
 
-# Main processing function using sentence-based chunking
+# Main processing function using parallel summarization
 def pdf_to_text(text, PDF, min_length=80):
     try:
         file_extension = os.path.splitext(PDF.name)[1].lower()
 
-        # If DOCX, convert to PDF
         if file_extension == '.docx':
             pdf_file_path = docx_to_pdf(PDF.name)
             text = extract_text(pdf_file_path)
-        # If PDF, extract text
         elif file_extension == '.pdf' and text == "":
             text = extract_text(PDF.name)
 
-        # Split text into chunks based on sentence boundaries
         chunks = chunk_text(text)
         summarized_text = ""
 
-        for chunk in chunks:
-            inputs = tokenizer([chunk], max_length=1024, truncation=True, return_tensors="pt")
-            min_length = int(min_length)
-            summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length + 400)
-            output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
-            summarized_text += output_text + "\n\n"
+        # Parallelize summarization using ThreadPoolExecutor
+        with ThreadPoolExecutor() as executor:
+            summaries = list(executor.map(lambda chunk: summarize_chunk(chunk, min_length), chunks))
+        summarized_text = "\n\n".join(summaries)
 
         # Save summarized text to PDF
         pdf = FPDF()
@@ -99,7 +81,6 @@ def pdf_to_text(text, PDF, min_length=80):
     except Exception as e:
         return None, f"An error occurred: {str(e)}", None
 
-# Preloaded document processor
 def process_sample_document(min_length=80):
     sample_document_path = "Marbury v. Madison.pdf"
 
@@ -109,7 +90,7 @@ def process_sample_document(min_length=80):
 # Gradio interface
 with gr.Blocks() as iface:
     with gr.Row():
-        process_sample_button = gr.Button("Summarize Marbury v. Madison Case Pre-Uploaded")
+        process_sample_button = gr.Button("Summarize Pre-Uploaded Marbury v. Madison Case Document")
 
     text_input = gr.Textbox(label="Input Text")
     file_input = gr.File(label="Upload PDF or DOCX")
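
For context, here is a minimal standalone sketch (not part of the commit) of the call pattern the new code uses: a per-chunk summarizer mapped over the chunks with ThreadPoolExecutor. It assumes the same facebook/bart-large-cnn checkpoint that app.py loads at module level; the chunk strings are illustrative placeholders rather than output of chunk_text.

from concurrent.futures import ThreadPoolExecutor
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

def summarize_chunk(chunk, min_length=80):
    # Tokenize one chunk and summarize it with greedy decoding (num_beams=1),
    # mirroring the generate() arguments used in the commit.
    inputs = tokenizer([chunk], max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"], num_beams=1, min_length=min_length, max_length=min_length + 300)
    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]

# Illustrative stand-ins for the output of chunk_text(text).
chunks = [
    "Marbury v. Madison established the principle of judicial review ...",
    "The Court held that it lacked jurisdiction to issue the writ of mandamus ...",
]

with ThreadPoolExecutor() as executor:
    summaries = list(executor.map(lambda c: summarize_chunk(c, 80), chunks))

print("\n\n".join(summaries))

All threads share the single module-level model, so any real speedup depends on how much of model.generate() releases the GIL during tensor work; the sketch only demonstrates the call pattern, not a measured gain.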