arithescientist committed on
Commit
0ad143b
·
1 Parent(s): 6ebd1a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -6
app.py CHANGED
@@ -13,7 +13,7 @@ import yake
13
  from summarizer import Summarizer,TransformerSummarizer
14
  from transformers import pipelines
15
  nltk.download('punkt')
16
- from transformers import AutoTokenizer, AutoModelForPreTraining
17
  # model_name = 'distilbert-base-uncased'
18
  model_name = 'nlpaueb/legal-bert-base-uncased'
19
  #model_name = 'laxya007/gpt2_legal'
@@ -29,13 +29,52 @@ print('Using model {}\n'.format(model_name))
29
 
30
 
31
 
32
- def get_response(input_text):
33
- output_text= bert_legal_model(input_text, min_length = 8, ratio = 0.05)
34
- return output_text
35
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  iface = gr.Interface(
38
- get_response,
39
  "text",
40
  "text"
41
  )
 
13
  from summarizer import Summarizer,TransformerSummarizer
14
  from transformers import pipelines
15
  nltk.download('punkt')
16
+ from transformers import AutoTokenizer, AutoModelForPreTraining, AutoConfig
17
  # model_name = 'distilbert-base-uncased'
18
  model_name = 'nlpaueb/legal-bert-base-uncased'
19
  #model_name = 'laxya007/gpt2_legal'
 
29
 
30
 
31
 
32
def lincoln(content=""):
    """Summarize ``content`` as a whole and paragraph by paragraph.

    The text is split into paragraphs on blank lines. Each paragraph is
    whitespace-normalized and reduced to its purely alphabetic tokens;
    paragraphs with at most one such token are skipped. Every remaining
    paragraph is passed to ``bert_legal_model`` with ``ratio=0.01``, and the
    complete, unmodified ``content`` is passed to it once with ``ratio=0.1``.

    ``word_tokenize`` and ``bert_legal_model`` are expected to be defined at
    module level elsewhere in this file.

    Args:
        content: The raw text to summarize. Defaults to an empty string
            (the previous default referenced ``input_text``, which is not
            defined in the visible part of this module).

    Returns:
        A single string: the whole-document summary, three newlines, the
        ``-------- The Larger Summary --------`` header line, and then each
        per-paragraph summary followed by a blank line.
    """
    paragraph_summaries = []
    for raw_paragraph in content.split("\n\n"):
        # Drop tabs outright, then collapse every remaining whitespace run
        # (including single newlines) into one space.
        paragraph = ' '.join(raw_paragraph.replace('\t', '').split())

        # Keep only purely alphabetic tokens; numbers and punctuation go.
        tokens = [word for word in word_tokenize(paragraph) if word.isalpha()]

        # Skip empty paragraphs and paragraphs with a single real word.
        if len(tokens) <= 1:
            continue

        # Rebuild the paragraph from the retained word tokens only.
        text = ' '.join(tokens)

        print("\nParagraph:")
        print(text + "\n")

        # Summarize this paragraph, not the whole document: passing
        # ``content`` here made every iteration emit the same summary.
        summary = bert_legal_model(text, ratio=0.01)
        paragraph_summaries.append(str(summary) + "\n\n")
        print("Summary:")
        print(summary)

    # One summary over the complete, uncleaned input.
    overall_summary = bert_legal_model(content, ratio=0.1)

    return (
        str(overall_summary) + "\n\n\n"
        + "-------- The Larger Summary --------\n"
        + "".join(paragraph_summaries)
    )
74
+
75
 
76
  iface = gr.Interface(
77
+ lincoln,
78
  "text",
79
  "text"
80
  )