Spaces:

arithescientist
/

lincolnlegal

Sleeping

App Files Files Community

Ari committed on Sep 5

Commit

170c2bc

•

1 Parent(s): 94bf427

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -21

app.py CHANGED Viewed

@@ -12,45 +12,35 @@ nltk.download('punkt')
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
-# Function to split the text into smaller chunks
-def split_text(text, chunk_size=1024):
-    words = text.split()
-    for i in range(0, len(words), chunk_size):
-        yield ' '.join(words[i:i + chunk_size])
 # Main processing function
 def pdf_to_text(text, PDF, min_length=20):
     try:
         # Extract text from PDF if no input text provided
         if text == "":
             text = extract_text(PDF.name)
-        # Split the text into chunks for summarization
-        summarized_text = ""
-        for chunk in split_text(text):
-            # Tokenize chunked text
-            inputs = tokenizer([chunk], max_length=1024, return_tensors="pt")
-            min_length = int(min_length)
-            # Generate summary for each chunk
-            summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length+1000)
-            output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
-            summarized_text += output_text + " "  # Append each chunk summary
         # Save summarized text to PDF
         pdf = FPDF()
         pdf.add_page()
         pdf.set_font("Times", size=12)
-        pdf.multi_cell(190, 10, txt=summarized_text, align='C')
         pdf_output_path = "legal.pdf"
         pdf.output(pdf_output_path)
         # Convert summarized text to audio
         audio_output_path = "legal.wav"
-        tts = gTTS(text=summarized_text, lang='en', slow=False)
         tts.save(audio_output_path)
-        return audio_output_path, summarized_text, pdf_output_path
     except Exception as e:
         return None, f"An error occurred: {str(e)}", None
@@ -63,4 +53,4 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    iface.launch()

 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
 # Main processing function
 def pdf_to_text(text, PDF, min_length=20):
     try:
         # Extract text from PDF if no input text provided
         if text == "":
             text = extract_text(PDF.name)
+        # Tokenize text
+        inputs = tokenizer([text], max_length=1024, return_tensors="pt")
+        min_length = int(min_length)
+        # Generate summary
+        summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length+1000)
+        output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
         # Save summarized text to PDF
         pdf = FPDF()
         pdf.add_page()
         pdf.set_font("Times", size=12)
+        pdf.multi_cell(190, 10, txt=output_text, align='C')
         pdf_output_path = "legal.pdf"
         pdf.output(pdf_output_path)
         # Convert summarized text to audio
         audio_output_path = "legal.wav"
+        tts = gTTS(text=output_text, lang='en', slow=False)
         tts.save(audio_output_path)
+        return audio_output_path, output_text, pdf_output_path
     except Exception as e:
         return None, f"An error occurred: {str(e)}", None
 )
 if __name__ == "__main__":
+    iface.launch()