arithescientist committed · Commit 90cc1ec
Parent(s): 5957677
Update app.py

app.py CHANGED
@@ -16,22 +16,19 @@ from summarizer import Summarizer,TransformerSummarizer
 from transformers import pipelines
 nltk.download('punkt')
 
-print("lets go")
 
-
-
-
-
+model_name = 'nlpaueb/legal-bert-base-uncased'
+# The setup of huggingface.co
+custom_config = AutoConfig.from_pretrained(model_name)
+custom_config.output_hidden_states=True
+custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
+custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
+bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
 
-# The setup of huggingface.co
-custom_config = AutoConfig.from_pretrained(model_name)
-custom_config.output_hidden_states=True
-custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
-custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
-bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
 
-
+def pdf(file):
 
+    pages= pdf2image.convert_from_path(pdf_path=file, dpi=400, size=(1654,2340))
     content = ""
     dir_name = 'images/' + file + '/'
     os.makedirs(dir_name, exist_ok=True)
@@ -41,74 +38,30 @@ def pdf(file):
     # OCR the image using Google's tesseract
     content += pt.image_to_string(pages[i])
 
-
-
-
-
-
-
-
-        tokens = word_tokenize(paragraph)
-        # only do real words
-        tokens = [word for word in tokens if word.isalpha()]
-        # print("\nTokens: {}\n".format(len(tokens)))
-        # only do sentences with more than 1 words excl. alpha crap
-        if len(tokens) <= 1:
-            continue
-        # Perhaps also ignore paragraphs with no sentence?
-        sentences = sent_tokenize(paragraph)
-
-        paragraph = ' '.join(tokens)
-
-        print("\nParagraph:")
-        print(paragraph+"\n")
-        # T5 needs to have 'summarize' in order to work:
-        # text = "summarize:" + paragraph
-        text = paragraph
-
-        summary = bert_legal_model(text, min_length = 8, ratio = 0.05)
-        # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
-        summary_text += str(summary) + "\n\n"
-        print("Summary:")
-        print(summary)
-
-    content2 = content.replace('\n',' ')
-    content2 = content2.replace('\t','')
-    summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
+    output_text= bert_legal_model(content, min_length = 8, ratio = 0.05)
+    output_text = output_text.replace(' ',' ')
+    output_text = output_text .replace(',.',',')
+    output_text = output_text .replace('\n',' ')
+    output_text = output_text .replace('..','.')
+
+    pdf = FPDF()
 
-
-
-    # write all to file for inspection and storage
-    all_text = "The Summary-- " + str(summary) + "\n\n\n" \
-        + "The Larger Summary-- " + str(summary_text)
-
-
-    all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
-    all_text2 = all_text2.replace('?','.')
-    all_text2 = all_text2.replace('\n',' ')
-    all_text2 = all_text2.replace('..','.')
-    all_text2 = all_text2.replace(',.',',')
-    all_text2 = all_text2.replace('-- ','\n\n\n')
-
-    pdf = FPDF()
-
     # Add a page
     pdf.add_page()
-
     pdf.set_font("Times", size = 12)
+
     # open the text file in read mode
     f = all_text2
-
-
-
-
+
+
+    return pdf.output("legal.pdf")
+
 
 
 iface = gr.Interface(
     pdf,
     "file",
-    "
+    "file"
 )
 
 if __name__ == "__main__":
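A note for readers following the change: the old code referenced model_name without ever defining it, so beyond deleting the print("lets go") debug line, the substantive fix is defining the model and moving the setup above the function. A minimal standalone sketch of that setup, using only the calls that appear in the diff plus a placeholder input string:

from transformers import AutoConfig, AutoTokenizer, AutoModel
from summarizer import Summarizer

model_name = 'nlpaueb/legal-bert-base-uncased'

# output_hidden_states=True lets bert-extractive-summarizer build sentence
# embeddings from the encoder's hidden states
custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)

# ratio=0.05 keeps roughly 5% of the sentences; min_length skips very short ones
print(bert_legal_model("Some long legal text to summarize. " * 40, min_length=8, ratio=0.05))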
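The loop that feeds content += pt.image_to_string(pages[i]) sits outside both hunks, so only its body is visible above. A sketch of the surrounding OCR step: the loop header and the input path are assumptions, while the dpi and size values are taken from the diff.

import pdf2image
import pytesseract as pt

pages = pdf2image.convert_from_path(pdf_path='input.pdf', dpi=400, size=(1654, 2340))

content = ""
for i in range(len(pages)):  # assumed loop header; only the body appears in the diff
    # OCR the rendered page image using Google's tesseract
    content += pt.image_to_string(pages[i])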
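Two loose ends appear to remain after this commit: f = all_text2 references a name whose assignments were all removed, and output_text is never written into the PDF before pdf.output("legal.pdf"). A hedged sketch of what the remaining FPDF lines presumably intend; the multi_cell call and the latin-1 re-encoding are assumptions carried over from the old version's workaround for FPDF's latin-1-only core fonts:

from fpdf import FPDF

def write_summary_pdf(output_text, path="legal.pdf"):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    # FPDF's built-in fonts only cover latin-1; mirror the old version's workaround
    safe_text = output_text.encode('latin-1', 'replace').decode('latin-1')
    pdf.multi_cell(0, 10, safe_text)  # width 0 spans the page; 10 is the line height
    pdf.output(path)
    return path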
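On the interface change: the output component is now "file", which expects the wrapped function to return a filepath. In classic fpdf, pdf.output("legal.pdf") writes the file rather than returning a usable path, so returning the filename explicitly is the safer pattern. A minimal end-to-end sketch under that assumption, with a placeholder body standing in for the OCR and summarization pipeline:

import gradio as gr
from fpdf import FPDF

def pdf(file):
    doc = FPDF()  # placeholder body; the real app OCRs and summarizes `file` first
    doc.add_page()
    doc.set_font("Times", size=12)
    doc.multi_cell(0, 10, "summary placeholder")
    doc.output("legal.pdf")
    return "legal.pdf"  # hand Gradio the path, not pdf.output()'s return value

iface = gr.Interface(pdf, "file", "file")

if __name__ == "__main__":
    iface.launch()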