hughustla commited on
Commit
5a60200
1 Parent(s): c3c27cd

Add application files

Browse files
app.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from gradio import inputs

# Two summarizers exist in src/: the extractive TextRank one
# (src.text_rank_summarizer) and the abstractive transformer one below.
from src.transformer_summarization import summarize

# The transformer `summarize` takes only the text, so the original
# "Summary Lines" Number input was dead UI that was never passed to the
# Interface — removed. Re-add a Number input (and include it in `inputs=`)
# if switching back to the TextRank summarizer, whose signature is
# summarize(text, summary_lines).
long_text_input = inputs.Textbox(lines=200, label='Long Text')

interface = gr.Interface(fn=summarize,
                         inputs=[long_text_input],
                         outputs=['text'],
                         live=False,
                         layout='horizontal',
                         css='css/index.css')

if __name__ == '__main__':
    app, local_url, share_url = interface.launch()
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "text_summarisation_demo"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["swhustla <fdkelly@gmail.com>"]
6
+
7
+ [tool.poetry.dependencies]
8
+ python = ">=3.9,<3.11"
9
+ gradio = "^2.9.0"
10
+ Jinja2 = "^3.0.3"
11
+ pytextrank = "^3.2.3"
12
+ huggingface = "^0.0.1"
13
+ transformers = {extras = ["torch"], version = "^4.17.0"}
14
+ torch = "^1.11.0"
15
+
16
+ [tool.poetry.dev-dependencies]
17
+
18
+ [build-system]
19
+ requires = ["poetry-core>=1.0.0"]
20
+ build-backend = "poetry.core.masonry.api"
src/text_rank_summarizer.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import spacy
import pytextrank
from math import sqrt
from operator import itemgetter

# Load the small English pipeline and append pytextrank's "textrank"
# component; importing `pytextrank` above is what registers the factory,
# and the component populates `doc._.phrases` used by the helpers below.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('textrank')
+
9
+
10
+ def _phrase_vector(doc):
11
+ phrase_id = 0
12
+ unit_vector = []
13
+ sent_bounds = [[s.start, s.end, set([])] for s in doc.sents]
14
+
15
+ for p in doc._.phrases:
16
+ unit_vector.append(p.rank)
17
+ for chunk in p.chunks:
18
+ for sent_start, sent_end, sent_vector in sent_bounds:
19
+ if chunk.start >= sent_start and chunk.end <= sent_end:
20
+ sent_vector.add(phrase_id)
21
+ break
22
+
23
+ phrase_id += 1
24
+
25
+ sum_ranks = sum(unit_vector)
26
+ return [rank / sum_ranks for rank in unit_vector], sent_bounds
27
+
28
+
29
+ def _sent_rank(unit_vector, sent_bounds):
30
+ sent_rank = {}
31
+ sent_id = 0
32
+
33
+ for sent_start, sent_end, sent_vector in sent_bounds:
34
+ sum_sq = 0.0
35
+ for phrase_id in range(len(unit_vector)):
36
+ if phrase_id not in sent_vector:
37
+ sum_sq += unit_vector[phrase_id] ** 2.0
38
+
39
+ sent_rank[sent_id] = sqrt(sum_sq)
40
+ sent_id += 1
41
+ return sent_rank
42
+
43
+
44
+ def _rank_to_summary(sent_rank, doc, summary_lines):
45
+ sent_text = {}
46
+ sent_id = 0
47
+
48
+ for sent in doc.sents:
49
+ sent_text[sent_id] = sent.text
50
+ sent_id += 1
51
+
52
+ summary = []
53
+ num_sent = 0
54
+ for sent_id, _ in sent_rank:
55
+ num_sent += 1
56
+ summary.append(sent_text[sent_id])
57
+ if num_sent == summary_lines:
58
+ break
59
+
60
+ return ' '.join(summary)
61
+
62
+
63
def summarize(text, summary_lines):
    """Extractively summarize `text` down to `summary_lines` sentences
    using the module-level TextRank pipeline."""
    doc = nlp(text)
    phrase_vector, sent_bounds = _phrase_vector(doc)
    distances = _sent_rank(phrase_vector, sent_bounds)
    # Sort sentence ids by distance, best (smallest) first.
    ranked_sentences = sorted(distances.items(), key=itemgetter(1))
    return _rank_to_summary(ranked_sentences, doc, summary_lines)
src/transformer_summarization.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import LongformerTokenizer, EncoderDecoderModel

# Load model and tokenizer once at import time (the weights are fetched
# from the Hugging Face hub on first run). NOTE(review): the tokenizer is
# the base Longformer one while the model checkpoint is a fine-tuned
# Longformer->RoBERTa encoder-decoder — presumably the pairing intended by
# the checkpoint author; verify against the model card.
model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16")
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
6
+
7
+
8
def summarize(text):
    """Return an abstractive summary of `text` produced by the module's
    pre-loaded encoder-decoder model."""
    encoded = tokenizer(text, return_tensors="pt")
    summary_ids = model.generate(encoded.input_ids)
    # Turn the generated token ids back into plain text, dropping
    # padding/BOS/EOS markers.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)