arithescientist committed · Commit 90cc1ec
Parent(s): 5957677
Update app.py

app.py CHANGED
@@ -16,22 +16,19 @@ from summarizer import Summarizer,TransformerSummarizer
 from transformers import pipelines
 nltk.download('punkt')
 
-print("lets go")
 
-
-
-
-
+model_name = 'nlpaueb/legal-bert-base-uncased'
+# The setup of huggingface.co
+custom_config = AutoConfig.from_pretrained(model_name)
+custom_config.output_hidden_states=True
+custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
+custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
+bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
 
-# The setup of huggingface.co
-custom_config = AutoConfig.from_pretrained(model_name)
-custom_config.output_hidden_states=True
-custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
-custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
-bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
 
-
+def pdf(file):
 
+    pages= pdf2image.convert_from_path(pdf_path=file, dpi=400, size=(1654,2340))
     content = ""
     dir_name = 'images/' + file + '/'
     os.makedirs(dir_name, exist_ok=True)
@@ -41,74 +38,30 @@ def pdf(file):
     # OCR the image using Google's tesseract
     content += pt.image_to_string(pages[i])
 
-
-
-
-
-
-
-
-        tokens = word_tokenize(paragraph)
-        # only do real words
-        tokens = [word for word in tokens if word.isalpha()]
-        # print("\nTokens: {}\n".format(len(tokens)))
-        # only do sentences with more than 1 words excl. alpha crap
-        if len(tokens) <= 1:
-            continue
-        # Perhaps also ignore paragraphs with no sentence?
-        sentences = sent_tokenize(paragraph)
-
-        paragraph = ' '.join(tokens)
-
-        print("\nParagraph:")
-        print(paragraph+"\n")
-        # T5 needs to have 'summarize' in order to work:
-        # text = "summarize:" + paragraph
-        text = paragraph
-
-        summary = bert_legal_model(text, min_length = 8, ratio = 0.05)
-        # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
-        summary_text += str(summary) + "\n\n"
-        print("Summary:")
-        print(summary)
-
-    content2 = content.replace('\n',' ')
-    content2 = content2.replace('\t','')
-    summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
+    output_text= bert_legal_model(content, min_length = 8, ratio = 0.05)
+    output_text = output_text.replace(' ',' ')
+    output_text = output_text .replace(',.',',')
+    output_text = output_text .replace('\n',' ')
+    output_text = output_text .replace('..','.')
+
+    pdf = FPDF()
 
-
-
-    # write all to file for inspection and storage
-    all_text = "The Summary-- " + str(summary) + "\n\n\n" \
-        + "The Larger Summary-- " + str(summary_text)
-
-
-    all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
-    all_text2 = all_text2.replace('?','.')
-    all_text2 = all_text2.replace('\n',' ')
-    all_text2 = all_text2.replace('..','.')
-    all_text2 = all_text2.replace(',.',',')
-    all_text2 = all_text2.replace('-- ','\n\n\n')
-
-    pdf = FPDF()
-
     # Add a page
     pdf.add_page()
-
     pdf.set_font("Times", size = 12)
+
     # open the text file in read mode
     f = all_text2
-
-
-
-
+
+
+    return pdf.output("legal.pdf")
+
 
 
 iface = gr.Interface(
     pdf,
     "file",
-    "
+    "file"
 )
 
 if __name__ == "__main__":
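A note for readers following the change: the old code referenced model_name without ever defining it, so beyond deleting the print("lets go") debug line, the substantive fix is defining the model and moving the setup above the function. A minimal standalone sketch of that setup, using only the calls that appear in the diff plus a placeholder input string:

from transformers import AutoConfig, AutoTokenizer, AutoModel
from summarizer import Summarizer

model_name = 'nlpaueb/legal-bert-base-uncased'

# output_hidden_states=True lets bert-extractive-summarizer build sentence
# embeddings from the encoder's hidden states
custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)

# ratio=0.05 keeps roughly 5% of the sentences; min_length skips very short ones
print(bert_legal_model("Some long legal text to summarize. " * 40, min_length=8, ratio=0.05))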
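The loop that feeds content += pt.image_to_string(pages[i]) sits outside both hunks, so only its body is visible above. A sketch of the surrounding OCR step: the loop header and the input path are assumptions, while the dpi and size values are taken from the diff.

import pdf2image
import pytesseract as pt

pages = pdf2image.convert_from_path(pdf_path='input.pdf', dpi=400, size=(1654, 2340))

content = ""
for i in range(len(pages)):  # assumed loop header; only the body appears in the diff
    # OCR the rendered page image using Google's tesseract
    content += pt.image_to_string(pages[i])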
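Two loose ends appear to remain after this commit: f = all_text2 references a name whose assignments were all removed, and output_text is never written into the PDF before pdf.output("legal.pdf"). A hedged sketch of what the remaining FPDF lines presumably intend; the multi_cell call and the latin-1 re-encoding are assumptions carried over from the old version's workaround for FPDF's latin-1-only core fonts:

from fpdf import FPDF

def write_summary_pdf(output_text, path="legal.pdf"):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    # FPDF's built-in fonts only cover latin-1; mirror the old version's workaround
    safe_text = output_text.encode('latin-1', 'replace').decode('latin-1')
    pdf.multi_cell(0, 10, safe_text)  # width 0 spans the page; 10 is the line height
    pdf.output(path)
    return path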
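On the interface change: the output component is now "file", which expects the wrapped function to return a filepath. In classic fpdf, pdf.output("legal.pdf") writes the file rather than returning a usable path, so returning the filename explicitly is the safer pattern. A minimal end-to-end sketch under that assumption, with a placeholder body standing in for the OCR and summarization pipeline:

import gradio as gr
from fpdf import FPDF

def pdf(file):
    doc = FPDF()  # placeholder body; the real app OCRs and summarizes `file` first
    doc.add_page()
    doc.set_font("Times", size=12)
    doc.multi_cell(0, 10, "summary placeholder")
    doc.output("legal.pdf")
    return "legal.pdf"  # hand Gradio the path, not pdf.output()'s return value

iface = gr.Interface(pdf, "file", "file")

if __name__ == "__main__":
    iface.launch()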