arithescientist committed
Commit 0ad143b · 1 Parent(s): 6ebd1a5
Update app.py
app.py CHANGED
@@ -13,7 +13,7 @@ import yake
 from summarizer import Summarizer,TransformerSummarizer
 from transformers import pipelines
 nltk.download('punkt')
-from transformers import AutoTokenizer, AutoModelForPreTraining
+from transformers import AutoTokenizer, AutoModelForPreTraining, AutoConfig
 # model_name = 'distilbert-base-uncased'
 model_name = 'nlpaueb/legal-bert-base-uncased'
 #model_name = 'laxya007/gpt2_legal'
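This hunk adds AutoConfig alongside AutoTokenizer and AutoModelForPreTraining. Lines 20-28 of app.py, where bert_legal_model is built, are not shown in this diff; with the bert-extractive-summarizer package those imports are normally wired together as in the sketch below. It follows that library's documented custom-model pattern and is an assumption about the hidden lines, not the commit's actual code.

# Assumed construction of bert_legal_model (not shown in this diff):
# the standard bert-extractive-summarizer custom-model pattern.
from summarizer import Summarizer
from transformers import AutoConfig, AutoModelForPreTraining, AutoTokenizer

model_name = 'nlpaueb/legal-bert-base-uncased'

# The extractive summarizer clusters sentence embeddings, so the encoder
# must expose hidden states from every layer.
custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True

custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModelForPreTraining.from_pretrained(model_name, config=custom_config)

bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)

This would also explain why the commit adds AutoConfig to the import line: setting output_hidden_states=True is what lets Summarizer read sentence embeddings out of legal-bert.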
@@ -29,13 +29,52 @@ print('Using model {}\n'.format(model_name))
 
 
 
-def
-
-
-
+def lincoln(content):
+
+
+    summary_text = ""
+    for i, paragraph in enumerate(content.split("\n\n")):
+        # get rid of empty paragraphs, one-word paragraphs, and extra whitespace
+        paragraph = paragraph.replace('\n', ' ')
+        paragraph = paragraph.replace('\t', '')
+        paragraph = ' '.join(paragraph.split())
+        # count the words in the paragraph and skip it if it is too short
+        tokens = word_tokenize(paragraph)
+        # keep only real words
+        tokens = [word for word in tokens if word.isalpha()]
+        # print("\nTokens: {}\n".format(len(tokens)))
+        # only summarize paragraphs with more than one word
+        if len(tokens) <= 1:
+            continue
+        # Perhaps also ignore paragraphs with no sentence?
+        sentences = sent_tokenize(paragraph)
+
+        # recreate the paragraph from the words-only token list
+        paragraph = ' '.join(tokens)
+
+        print("\nParagraph:")
+        print(paragraph + "\n")
+        # T5 needs 'summarize:' prepended in order to work:
+        # text = "summarize: " + paragraph
+        text = paragraph
+        # summarize the cleaned paragraph
+        summary = bert_legal_model(text, ratio=0.01)
+        # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
+        summary_text += str(summary) + "\n\n"
+        print("Summary:")
+        print(summary)
+
+    # a short summary of the whole document
+    summary = bert_legal_model(content, ratio=0.1)
+
+    all_text = str(summary) + "\n\n\n" \
+        + "-------- The Larger Summary --------\n" + str(summary_text)
+
+    return all_text
+
 
 iface = gr.Interface(
-
+    lincoln,
     "text",
     "text"
 )
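With lincoln passed as the interface's function and both I/O components set to "text", the Space takes a block of text and returns the short whole-document summary followed by the per-paragraph summaries. A minimal way to exercise it locally (a sketch; the sample text and the launch() call are assumptions, not part of the commit):

# Assumed usage, not part of the commit.
sample = "The parties entered into a lease agreement.\n\nThe term of the lease is five years, commencing on the delivery date."
print(lincoln(sample))   # short summary plus "The Larger Summary" section

iface.launch()           # serve the same function as a Gradio web UI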