Ari committed on
Commit
170c2bc
1 Parent(s): 94bf427

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -21
app.py CHANGED
@@ -12,45 +12,35 @@ nltk.download('punkt')
12
  tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
13
  model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
14
 
15
- # Function to split the text into smaller chunks
16
- def split_text(text, chunk_size=1024):
17
- words = text.split()
18
- for i in range(0, len(words), chunk_size):
19
- yield ' '.join(words[i:i + chunk_size])
20
-
21
  # Main processing function
22
  def pdf_to_text(text, PDF, min_length=20):
23
  try:
24
  # Extract text from PDF if no input text provided
25
  if text == "":
26
  text = extract_text(PDF.name)
 
 
 
 
27
 
28
- # Split the text into chunks for summarization
29
- summarized_text = ""
30
- for chunk in split_text(text):
31
- # Tokenize chunked text
32
- inputs = tokenizer([chunk], max_length=1024, return_tensors="pt")
33
- min_length = int(min_length)
34
-
35
- # Generate summary for each chunk
36
- summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length+1000)
37
- output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
38
- summarized_text += output_text + " " # Append each chunk summary
39
 
40
  # Save summarized text to PDF
41
  pdf = FPDF()
42
  pdf.add_page()
43
  pdf.set_font("Times", size=12)
44
- pdf.multi_cell(190, 10, txt=summarized_text, align='C')
45
  pdf_output_path = "legal.pdf"
46
  pdf.output(pdf_output_path)
47
 
48
  # Convert summarized text to audio
49
  audio_output_path = "legal.wav"
50
- tts = gTTS(text=summarized_text, lang='en', slow=False)
51
  tts.save(audio_output_path)
52
 
53
- return audio_output_path, summarized_text, pdf_output_path
54
 
55
  except Exception as e:
56
  return None, f"An error occurred: {str(e)}", None
@@ -63,4 +53,4 @@ iface = gr.Interface(
63
  )
64
 
65
  if __name__ == "__main__":
66
- iface.launch()
 
12
  tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
13
  model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
14
 
 
 
 
 
 
 
15
# Main processing function
def pdf_to_text(text, PDF, min_length=20):
    """Summarize input text (or an uploaded PDF) with BART, then persist the
    summary as both a PDF ("legal.pdf") and spoken audio ("legal.wav").

    Parameters
    ----------
    text : str
        Raw input text. When empty (""), text is extracted from *PDF* instead.
    PDF : file-like
        Uploaded PDF object (presumably a Gradio file wrapper exposing
        ``.name`` — confirm against the Interface inputs); only read when
        *text* is empty.
    min_length : int or str, default 20
        Minimum summary length in tokens; coerced with ``int()`` because the
        UI may pass it as a string.

    Returns
    -------
    tuple
        ``(audio_path, summary_text, pdf_path)`` on success, or
        ``(None, error_message, None)`` if any step raises.
    """
    try:
        # Extract text from PDF if no input text provided
        if text == "":
            text = extract_text(PDF.name)

        # Tokenize text.
        # BUG FIX: truncation=True is required — max_length alone does NOT
        # truncate, so inputs longer than 1024 tokens (typical for a PDF)
        # would overflow BART's position embeddings and crash generate(),
        # surfacing only as the generic error string below.
        inputs = tokenizer([text], max_length=1024, truncation=True,
                           return_tensors="pt")
        min_length = int(min_length)

        # Generate summary (beam search; max_length bounded relative to
        # min_length as in the original code)
        summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length+1000)
        output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]

        # Save summarized text to PDF
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, txt=output_text, align='C')
        pdf_output_path = "legal.pdf"
        pdf.output(pdf_output_path)

        # Convert summarized text to audio
        audio_output_path = "legal.wav"
        tts = gTTS(text=output_text, lang='en', slow=False)
        tts.save(audio_output_path)

        return audio_output_path, output_text, pdf_output_path

    except Exception as e:
        # Broad catch is deliberate here: this is a UI entry point and the
        # three-tuple shape must be preserved for the Gradio outputs.
        return None, f"An error occurred: {str(e)}", None
 
53
  )
54
 
55
  if __name__ == "__main__":
56
+ iface.launch()