Spaces:

arithescientist
/

lincolnlegal

Running

App Files Community

arithescientist committed on Jun 1, 2022

Commit

342a4a2

1 Parent(s): 02a288f

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -263

app.py CHANGED Viewed

@@ -17,275 +17,118 @@ import yake
 from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
 from summarizer import Summarizer,TransformerSummarizer
 from transformers import pipelines
-#nltk.download('punkt')
 print("lets go")
-app = flask.Flask(__name__)
-app.config["DEBUG"] = True
-UPLOAD_FOLDER = './pdfs'
-ALLOWED_EXTENSIONS = {'txt', 'pdf'}
-app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
-#***************** FLASK *****************************
-CORS(app)
-def allowed_file(filename):
-    return '.' in filename and \
-           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
-#model_name = 'laxya007/gpt2_legal'
-#model_name = 'facebook/bart-large-cnn'
-model_name = 'nlpaueb/legal-bert-base-uncased'
-#The setup of huggingface.co
-print("lets go")
-custom_config = AutoConfig.from_pretrained(model_name)
-print("lets go")
-custom_config.output_hidden_states=True
-print("lets go")
-custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
-print("lets go")
-custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
-print("lets go")
-bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
-print('Using model {}\n'.format(model_name))
-# main index page route
-@app.route('/')
-@cross_origin()
-def index():
-    return render_template('index.html')
-@cross_origin()
-@app.route('/results')
-def results():
-    return render_template('results.html')
-@app.route('/predict', methods=['GET', 'POST'])
-def uploads():
-    if request.method == 'GET':
-        # Get the file from post request
-        numsent = int(request.args['number'])
-        text = str(request.args['text'])
-        content = text
-        summary_text = ""
-        for i, paragraph in enumerate(content.split("\n\n")):
-            paragraph = paragraph.replace('\n',' ')
-            paragraph = paragraph.replace('\t','')
-            paragraph = ' '.join(paragraph.split())
-            # count words in the paragraph and exclude if less than 4 words
-            tokens = word_tokenize(paragraph)
-            # only do real words
-            tokens = [word for word in tokens if word.isalpha()]
-            # print("\nTokens: {}\n".format(len(tokens)))
-            # only do sentences with more than 1 words excl. alpha crap
-            if len(tokens) <= 1:
-                continue
-            # Perhaps also ignore paragraphs with no sentence?
-            sentences = sent_tokenize(paragraph)
-            paragraph = ' '.join(tokens)
-            print("\nParagraph:")
-            print(paragraph+"\n")
-            # T5 needs to have 'summarize' in order to work:
-            # text = "summarize:" + paragraph
-            text = paragraph
-            summary = bert_legal_model(text,  min_length = 8, ratio = 0.05)
-            # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
-            summary_text += str(summary) + "\n\n"
-            print("Summary:")
-            print(summary)
-        content2 = content.replace('\n',' ')
-        content2 = content2.replace('\t','')
-        summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
-        # write all to file for inspection and storage
-        all_text = "The Summary-- " + str(summary) + "\n\n\n" \
-            + "The Larger Summary-- " + str(summary_text)
-        all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
-        all_text2 = all_text2.replace('?','.')
-        all_text2 = all_text2.replace('\n',' ')
-        all_text2 = all_text2.replace('..','.')
-        all_text2 = all_text2.replace(',.',',')
-        all_text2 = all_text2.replace('-- ','\n\n\n')
-        pdf = FPDF()
-        # Add a page
-        pdf.add_page()
-        pdf.set_font("Times", size = 12)
-        # open the text file in read mode
-        f = all_text2
-        # insert the texts in pdf
-        pdf.multi_cell(190, 10, txt = f, align = 'C')
-        # save the pdf with name .pdf
-        pdf.output("./static/legal.pdf")
-        all_text
-        return render_template('results.html')
-    return None
-@app.route('/predictpdf', methods=['GET', 'POST'])
-def uploads2():
-    if request.method == 'POST':
-        # Get the file from post request
-        numsent = int(request.args['number'])
-        if 'file' not in request.files:
-            flash('No file part')
-            return redirect(request.url)
-        file = request.files['file']
-        # if user does not select file, browser also
-        # submit an empty part without filename
-        if file.filename == '':
-            flash('No selected file')
-            return redirect(request.url)
-        if file and allowed_file(file.filename):
-            filename = "legal.pdf"
-            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
-        f = request.files['file']
-        f.save(secure_filename(f.filename))
-        path = os.getcwd()
-        folder_name = 'pdfs'
-        path = os.path.join(path, folder_name)
-        list_of_files = []
-        for root, dirs, files in os.walk(path):
-            for file in files:
-                if(file.endswith(".pdf")):
-                    # print(os.path.join(root,file))
-                    list_of_files.append(os.path.join(root,file))
-        print("\nProcessing {} files...\n".format(len(list_of_files)))
-        total_pages = 0
-        for filename in list_of_files:
-            print(filename)
-            file = os.path.splitext(os.path.basename(filename))[0]
-            pages = pdf2image.convert_from_path(pdf_path=filename, dpi=400, size=(1654,2340))
-            total_pages += len(pages)
-            print("\nProcessing the next {} pages...\n".format(len(pages)))
-            # Then save all pages as images and convert them to text except the last page
-            # TODO: create this as a function
-            content = ""
-            dir_name = 'images/' + file + '/'
-            os.makedirs(dir_name, exist_ok=True)
-            # If folder doesn't exist, then create it.
-            for i in range(len(pages)-1):
-                pages[i].save(dir_name + str(i) + '.jpg')
-                # OCR the image using Google's tesseract
-                content += pt.image_to_string(pages[i])
-            summary_text = ""
-            for i, paragraph in enumerate(content.split("\n\n")):
-                paragraph = paragraph.replace('\n',' ')
-                paragraph = paragraph.replace('\t','')
-                paragraph = ' '.join(paragraph.split())
-                # count words in the paragraph and exclude if less than 4 words
-                tokens = word_tokenize(paragraph)
-                # only do real words
-                tokens = [word for word in tokens if word.isalpha()]
-                # print("\nTokens: {}\n".format(len(tokens)))
-                # only do sentences with more than 1 words excl. alpha crap
-                if len(tokens) <= 1:
-                    continue
-                # Perhaps also ignore paragraphs with no sentence?
-                sentences = sent_tokenize(paragraph)
-                paragraph = ' '.join(tokens)
-                print("\nParagraph:")
-                print(paragraph+"\n")
-                # T5 needs to have 'summarize' in order to work:
-                # text = "summarize:" + paragraph
-                text = paragraph
-                summary = bert_legal_model(text,  min_length = 8, ratio = 0.05)
-                # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
-                summary_text += str(summary) + "\n\n"
-                print("Summary:")
-                print(summary)
-            content2 = content.replace('\n',' ')
-            content2 = content2.replace('\t','')
-            summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
-            # write all to file for inspection and storage
-            all_text = "The Summary-- " + str(summary) + "\n\n\n" \
-                + "The Larger Summary-- " + str(summary_text)
-            all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
-            all_text2 = all_text2.replace('?','.')
-            all_text2 = all_text2.replace('\n',' ')
-            all_text2 = all_text2.replace('..','.')
-            all_text2 = all_text2.replace(',.',',')
-            all_text2 = all_text2.replace('-- ','\n\n\n')
-            pdf = FPDF()
-            # Add a page
-            pdf.add_page()
-            pdf.set_font("Times", size = 12)
-            # open the text file in read mode
-            f = all_text2
-            # insert the texts in pdf
-            pdf.multi_cell(190, 10, txt = f, align = 'C')
-            # save the pdf with name .pdf
-            pdf.output("./static/legal.pdf")
-            all_text
-        return render_template('results.html')
-    return None
-import gradio as gr
-iface = gr.Interface(fn=index)
-iface.launch()

 from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
 from summarizer import Summarizer,TransformerSummarizer
 from transformers import pipelines
+nltk.download('punkt')  # fetch NLTK's 'punkt' data at import time; this call was commented out before this commit
 print("lets go")
+def pdf(file):
+  #model_name = 'laxya007/gpt2_legal'
+  # model_name = 'facebook/bart-large-cnn'
+  model_name = 'nlpaueb/legal-bert-base-uncased'
+  # The setup of huggingface.co
+  custom_config = AutoConfig.from_pretrained(model_name)
+  custom_config.output_hidden_states=True
+  custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
+  custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
+  bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
+  print('Using model {}\n'.format(model_name))
+  list_of_files = file
+  print("\nProcessing {} files...\n".format(len(list_of_files)))
+  total_pages = 0
+  for filename in list_of_files:
+      print(filename)
+      file = os.path.splitext(os.path.basename(filename))[0]
+      pages = pdf2image.convert_from_path(pdf_path=filename, dpi=400, size=(1654,2340))
+      total_pages += len(pages)
+      print("\nProcessing the next {} pages...\n".format(len(pages)))
+      # Then save all pages as images and convert them to text except the last page
+      # TODO: create this as a function
+      content = ""
+      dir_name = 'images/' + file + '/'
+      os.makedirs(dir_name, exist_ok=True)
+      # If folder doesn't exist, then create it.
+      for i in range(len(pages)-1):
+          pages[i].save(dir_name + str(i) + '.jpg')
+          # OCR the image using Google's tesseract
+          content += pt.image_to_string(pages[i])
+      summary_text = ""
+      for i, paragraph in enumerate(content.split("\n\n")):
+          paragraph = paragraph.replace('\n',' ')
+          paragraph = paragraph.replace('\t','')
+          paragraph = ' '.join(paragraph.split())
+          # count words in the paragraph and exclude if less than 4 words
+          tokens = word_tokenize(paragraph)
+          # only do real words
+          tokens = [word for word in tokens if word.isalpha()]
+          # print("\nTokens: {}\n".format(len(tokens)))
+          # only do sentences with more than 1 words excl. alpha crap
+          if len(tokens) <= 1:
+              continue
+          # Perhaps also ignore paragraphs with no sentence?
+          sentences = sent_tokenize(paragraph)
+          paragraph = ' '.join(tokens)
+          print("\nParagraph:")
+          print(paragraph+"\n")
+          # T5 needs to have 'summarize' in order to work:
+          # text = "summarize:" + paragraph
+          text = paragraph
+          summary = bert_legal_model(text,  min_length = 8, ratio = 0.05)
+          # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
+          summary_text += str(summary) + "\n\n"
+          print("Summary:")
+          print(summary)
+      content2 = content.replace('\n',' ')
+      content2 = content2.replace('\t','')
+      summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
+      # write all to file for inspection and storage
+      all_text = "The Summary-- " + str(summary) + "\n\n\n" \
+          + "The Larger Summary-- " + str(summary_text)
+      all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
+      all_text2 = all_text2.replace('?','.')
+      all_text2 = all_text2.replace('\n',' ')
+      all_text2 = all_text2.replace('..','.')
+      all_text2 = all_text2.replace(',.',',')
+      all_text2 = all_text2.replace('-- ','\n\n\n')
+      pdf = FPDF()
+      # Add a page
+      pdf.add_page()
+      pdf.set_font("Times", size = 12)
+      # open the text file in read mode
+      f = all_text2
+      return f
+import gradio as gr
+iface = gr.Interface(
+    pdf,
+    gr.inputs.Image(shape=(224, 224)),
+    gr.outputs.Label(f),
+    capture_session=True,
+    interpretation="default",
+   )
+if __name__ == "__main__":
+    iface.launch(share=True)