Ari committed on
Commit 9d0e6a8
1 Parent(s): c8bcda4

Update app.py

Files changed (1)
  1. app.py +34 -12
app.py CHANGED
@@ -14,6 +14,24 @@ nltk.download('punkt')
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
 
+# Function to split text into chunks
+def split_text(text, max_chunk_size=1024):
+    sentences = nltk.sent_tokenize(text)
+    chunks = []
+    chunk = ""
+
+    for sentence in sentences:
+        if len(chunk) + len(sentence) <= max_chunk_size:
+            chunk += sentence + " "
+        else:
+            chunks.append(chunk.strip())
+            chunk = sentence + " "
+
+    if chunk:
+        chunks.append(chunk.strip())
+
+    return chunks
+
 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     doc = Document(docx_file)
     full_text = []
@@ -31,7 +49,8 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     pdf.save()
     return output_pdf
 
-def pdf_to_text(text, PDF, min_length=80): # Increase default min_length by 4 times
+# Main processing function with text chunking
+def pdf_to_text(text, PDF, min_length=80):
     try:
         file_extension = os.path.splitext(PDF.name)[1].lower()
 
@@ -41,25 +60,28 @@ def pdf_to_text(text, PDF, min_length=80): # Increase default min_length by 4 t
         elif file_extension == '.pdf' and text == "":
             text = extract_text(PDF.name)
 
-        inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
-        min_length = int(min_length)
-
-        # Explicitly setting clean_up_tokenization_spaces=True to match the future default behavior
-        summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length + 4000)
-        output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
-
+        chunks = split_text(text)
+        summarized_text = ""
+
+        for chunk in chunks:
+            inputs = tokenizer([chunk], max_length=1024, truncation=True, return_tensors="pt")
+            min_length = int(min_length)
+            summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length + 400)
+            output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
+            summarized_text += output_text + "\n\n"
+
         pdf = FPDF()
         pdf.add_page()
         pdf.set_font("Times", size=12)
-        pdf.multi_cell(190, 10, txt=output_text, align='C')
+        pdf.multi_cell(190, 10, txt=summarized_text, align='C')
         pdf_output_path = "legal.pdf"
         pdf.output(pdf_output_path)
 
         audio_output_path = "legal.wav"
-        tts = gTTS(text=output_text, lang='en', slow=False)
+        tts = gTTS(text=summarized_text, lang='en', slow=False)
         tts.save(audio_output_path)
 
-        return audio_output_path, output_text, pdf_output_path
+        return audio_output_path, summarized_text, pdf_output_path
 
     except Exception as e:
         return None, f"An error occurred: {str(e)}", None
@@ -76,7 +98,7 @@ with gr.Blocks() as iface:
 
     text_input = gr.Textbox(label="Input Text")
     file_input = gr.File(label="Upload PDF or DOCX")
-    slider = gr.Slider(minimum=10, maximum=400, step=10, value=80, label="Summary Minimum Length") # Default value set to 80
+    slider = gr.Slider(minimum=10, maximum=400, step=10, value=80, label="Summary Minimum Length")
 
     audio_output = gr.Audio(label="Generated Audio")
     summary_output = gr.Textbox(label="Generated Summary")
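
For context, a small standalone sketch of how the sentence-based chunking introduced in this commit behaves. The helper body mirrors the split_text added above; the sample string and the 200-character limit are illustrative assumptions, not values from the repository.

# Illustrative sketch (not part of the commit): pack whole sentences into
# chunks of at most max_chunk_size characters, as the new split_text helper does.
import nltk

nltk.download('punkt')

def split_text(text, max_chunk_size=1024):
    sentences = nltk.sent_tokenize(text)
    chunks, chunk = [], ""
    for sentence in sentences:
        if len(chunk) + len(sentence) <= max_chunk_size:
            chunk += sentence + " "
        else:
            chunks.append(chunk.strip())
            chunk = sentence + " "
    if chunk:
        chunks.append(chunk.strip())
    return chunks

sample = "Each clause is summarized on its own. " * 40   # hypothetical long input
chunks = split_text(sample, max_chunk_size=200)
print(len(chunks), max(len(c) for c in chunks))          # several chunks, each within the limit

Because every chunk fits well within one BART input window, the per-chunk generate() calls in the updated pdf_to_text avoid the silent truncation that a single oversized input hit in the previous version.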