Commit 086fdba (parent: e75148e)

Add controlled summarization (#3)

- Add controlled summarization (b723b598f051adf56e95b16e90992eaee5dca0df)
- Delete unimportant files (387bd94d1e8d8c6d587955cb6bde7fdc6495b2f7)

Co-authored-by: Yixi Ding <dyxohjl666@users.noreply.huggingface.co>

Files changed:
- README.md +13 -13
- app.py +161 -111
- bart-large-cnn-e5.pt +0 -3
- controlled_summarization.py +55 -0
- description.py +54 -33
- examples/BERT - Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf +0 -0
- reference_string_parsing.py +36 -36
- requirements.txt +2 -2
- summarization.py +36 -36
README.md (CHANGED, +13 -13)

The old and new contents shown in the diff are textually identical; the file reads:

    ---
    title: Test Sciassist
    emoji: π
    colorFrom: red
    colorTo: red
    sdk: gradio
    sdk_version: 3.4
    app_file: app.py
    pinned: false
    license: afl-3.0
    ---

    Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py (CHANGED, +161 -111)

Old version: only the import block survives in the extracted diff; the remaining lines of the previous 111-line file are blank in the extraction and are not reproduced here.

    import gradio as gr
    from description import *

    from reference_string_parsing import *
    from summarization import *

New version (indentation restored from the Gradio Blocks structure):

    import gradio as gr
    from description import *

    from reference_string_parsing import *
    from summarization import *
    from controlled_summarization import *

    with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
        gr.Markdown("# Gradio Demo for SciAssist")
        with gr.Tabs():
            # Reference String Parsing
            with gr.TabItem("Reference String Parsing"):
                with gr.Box():
                    gr.Markdown(rsp_str_md)
                    with gr.Row():
                        with gr.Column():
                            rsp_str = gr.Textbox(label="Input String")
                        with gr.Column():
                            rsp_str_dehyphen = gr.Checkbox(label="dehyphen")
                    with gr.Row():
                        rsp_str_btn = gr.Button("Parse")
                    rsp_str_output = gr.HighlightedText(
                        elem_id="htext",
                        label="The Result of Parsing",
                        combine_adjacent=True,
                        adjacent_separator=" ",
                    )
                    rsp_str_examples = gr.Examples(examples=[[
                        "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).",
                        True],
                        [
                        "Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval-2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).",
                        False]], inputs=[rsp_str, rsp_str_dehyphen])
                with gr.Box():
                    gr.Markdown(rsp_file_md)
                    with gr.Row():
                        with gr.Column():
                            rsp_file = gr.File(label="Input File")
                            rsp_file_dehyphen = gr.Checkbox(label="dehyphen")
                    with gr.Row():
                        rsp_file_btn = gr.Button("Parse")

                    rsp_file_output = gr.HighlightedText(
                        elem_id="htext",
                        label="The Result of Parsing",
                        combine_adjacent=True,
                        adjacent_separator=" ",
                    )
                    rsp_file_examples = gr.Examples(examples=[["examples/N18-3011_ref.txt", False], ["examples/BERT_paper.pdf", True]], inputs=[rsp_file, rsp_file_dehyphen])

                rsp_file_btn.click(
                    fn=rsp_for_file,
                    inputs=[rsp_file, rsp_file_dehyphen],
                    outputs=rsp_file_output
                )
                rsp_str_btn.click(
                    fn=rsp_for_str,
                    inputs=[rsp_str, rsp_str_dehyphen],
                    outputs=rsp_str_output
                )

            # Single Document Summarization
            with gr.TabItem("Summarization"):
                with gr.Box():
                    gr.Markdown(ssum_str_md)
                    with gr.Row():
                        with gr.Column():
                            ssum_str = gr.Textbox(label="Input String")
                        # with gr.Column():
                        #     ssum_str_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
                        #     ssum_str_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
                    with gr.Row():
                        ssum_str_btn = gr.Button("Generate")
                    ssum_str_output = gr.Textbox(
                        elem_id="htext",
                        label="Summary",
                    )
                    ssum_str_examples = gr.Examples(examples=[[ssum_str_example], ],
                                                    inputs=[ssum_str])
                with gr.Box():
                    gr.Markdown(ssum_file_md)
                    with gr.Row():
                        with gr.Column():
                            ssum_file = gr.File(label="Input File")
                        # with gr.Column():
                        #     ssum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
                        #     ssum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
                    with gr.Row():
                        ssum_file_btn = gr.Button("Generate")
                    ssum_file_output = gr.Textbox(
                        elem_id="htext",
                        label="Summary",
                    )
                    ssum_file_examples = gr.Examples(examples=[["examples/BERT_body.txt"], ["examples/BERT_paper.pdf"]],
                                                     inputs=[ssum_file])

                ssum_file_btn.click(
                    fn=ssum_for_file,
                    inputs=[ssum_file],
                    outputs=ssum_file_output
                )
                ssum_str_btn.click(
                    fn=ssum_for_str,
                    inputs=[ssum_str],
                    outputs=ssum_str_output
                )

            # Controlled Summarization
            with gr.TabItem("Controlled Summarization"):
                with gr.Box():
                    gr.Markdown(ctrlsum_str_md)
                    with gr.Row():
                        with gr.Column():
                            ctrlsum_str = gr.Textbox(label="Input String")
                        with gr.Column():
                            # ctrlsum_str_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
                            # ctrlsum_str_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
                            ctrlsum_str_length = gr.Slider(0, 300, step=50, label="Length")
                            ctrlsum_str_keywords = gr.Textbox(label="Keywords")
                    with gr.Row():
                        ctrlsum_str_btn = gr.Button("Generate")
                    ctrlsum_str_output = gr.Textbox(
                        elem_id="htext",
                        label="Summary",
                    )
                    ctrlsum_str_examples = gr.Examples(examples=[[ssum_str_example, 50, "BERT"], ],
                                                       inputs=[ctrlsum_str, ctrlsum_str_length, ctrlsum_str_keywords])
                with gr.Box():
                    gr.Markdown(ctrlsum_file_md)
                    with gr.Row():
                        with gr.Column():
                            ctrlsum_file = gr.File(label="Input File")
                        with gr.Column():
                            # ctrlsum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
                            # ctrlsum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
                            ctrlsum_file_length = gr.Slider(0,300,step=50, label="Length")
                            ctrlsum_file_keywords = gr.Textbox(label="Keywords")
                    with gr.Row():
                        ctrlsum_file_btn = gr.Button("Generate")
                    ctrlsum_file_output = gr.Textbox(
                        elem_id="htext",
                        label="Summary",
                    )
                    ctrlsum_file_examples = gr.Examples(examples=[["examples/BERT_body.txt", 100, ""], ["examples/BERT_paper.pdf", 0, "BERT"]],
                                                        inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords])

                ctrlsum_file_btn.click(
                    fn=ctrlsum_for_file,
                    inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords],
                    outputs=ctrlsum_file_output
                )
                ctrlsum_str_btn.click(
                    fn=ctrlsum_for_str,
                    inputs=[ctrlsum_str, ctrlsum_str_length, ctrlsum_str_keywords],
                    outputs=ctrlsum_str_output
                )


    demo.launch(share=True)
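Every tab above follows the same wiring pattern: input widgets, a Button, an output widget, and a `Button.click` call that connects them. A minimal, self-contained sketch of that pattern (a toy handler standing in for the SciAssist pipelines, not part of this commit):

```python
import gradio as gr


def toy_summary(text: str, length: float) -> str:
    # Stand-in for ctrlsum_for_str: simply truncate the input text.
    n = int(length)
    return text[: n or len(text)]


with gr.Blocks() as toy_demo:
    inp = gr.Textbox(label="Input String")
    length = gr.Slider(0, 300, step=50, label="Length")
    btn = gr.Button("Generate")
    out = gr.Textbox(label="Summary")
    # Same wiring style as app.py: fn, a list of inputs, one output.
    btn.click(fn=toy_summary, inputs=[inp, length], outputs=out)

if __name__ == "__main__":
    toy_demo.launch()
```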
bart-large-cnn-e5.pt (DELETED, -3)

The Git LFS pointer for the bundled checkpoint was removed:

    version https://git-lfs.github.com/spec/v1
    oid sha256:4d4aab21eb3b88c4978c54a03214da478828b672d60bff3b0cf8fdfb646f4d66
    size 1625559041
controlled_summarization.py (ADDED, +55)

    from typing import List, Tuple
    import torch
    from SciAssist import Summarization

    device = "gpu" if torch.cuda.is_available() else "cpu"

    ctrlsum_pipeline = Summarization(os_name="nt",checkpoint="google/flan-t5-base")


    def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:

        if keywords is not None:
            keywords = keywords.strip().split(",")
            if keywords[0] == "":
                keywords = None
        if length==0 or length is None:
            length = None
        results = ctrlsum_pipeline.predict(input, type="str",
                                           length=length, keywords=keywords)

        output = []
        for res in results["summary"]:
            output.append(f"{res}\n\n")
        return "".join(output)


    def ctrlsum_for_file(input, length=None, keywords=None) -> List[Tuple[str, str]]:
        if input == None:
            return None
        filename = input.name
        if keywords is not None:
            keywords = keywords.strip().split(",")
            if keywords[0] == "":
                keywords = None
        if length==0:
            length = None
        # Identify the format of input and parse reference strings
        if filename[-4:] == ".txt":
            results = ctrlsum_pipeline.predict(filename, type="txt",
                                               save_results=False,
                                               length=length, keywords=keywords)
        elif filename[-4:] == ".pdf":
            results = ctrlsum_pipeline.predict(filename,
                                               save_results=False, length=length, keywords=keywords)
        else:
            return [("File Format Error !", None)]

        output = []
        for res in results["summary"]:
            output.append(f"{res}\n\n")
        return "".join(output)


    ssum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . • BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
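The two helpers above normalize the Gradio widget values before calling the pipeline: a length of 0 and an empty keyword box both become None, which means "no control". A hypothetical usage sketch (it assumes the SciAssist package is installed and can fetch the flan-t5 checkpoint):

```python
# Hypothetical calls, mirroring what the Gradio widgets pass in: raw text,
# a slider value, and a comma-separated keyword string.
from controlled_summarization import ctrlsum_for_str, ssum_str_example

# Controlled: ask for roughly 50 tokens, focused on the keyword "BERT".
print(ctrlsum_for_str(ssum_str_example, length=50, keywords="BERT"))

# length=0 and an empty keyword box mean "no control": both are mapped
# to None before ctrlsum_pipeline.predict is called.
print(ctrlsum_for_str(ssum_str_example, length=0, keywords=""))
```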
description.py (CHANGED, +54 -33)

The new version keeps the existing reference-string-parsing and string-summarization descriptions (the remaining lines of the old 33-line file were blank apart from a stray "- A" fragment) and adds descriptions for file summarization and controlled summarization:

    # Reference string parsing Markdown
    rsp_str_md = '''
    To **test on strings**, simply input one or more strings.
    '''

    rsp_file_md = '''
    To **test on a file**, the input can be:

    - A txt file which contains a reference string in each line.

    - A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).

    '''
    # - A pdf file which contains a whole scientific document without any processing (including title, author...).

    ssum_str_md = '''
    To **test on strings**, simply input a string.

    '''

    ssum_file_md = '''
    To **test on a file**, the input can be:

    - A txt file which contains the content to be summarized.

    - A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).


    '''

    # - The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
    ctrlsum_str_md = '''
    To **test on strings**, simply input a string.

    **Note**:

    - Length 0 will exert no control over length.


    '''

    ctrlsum_file_md = '''
    To **test on a file**, the input can be:

    - A txt file which contains the content to be summarized.

    - A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).

    **Note**:

    - Length 0 will exert no control over length.


    '''
examples/BERT - Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf (ADDED)

Binary file (775 kB).
reference_string_parsing.py (CHANGED, +36 -36)

The old and new contents shown in the diff are textually identical; the file reads:

    from typing import List, Tuple
    import torch
    from SciAssist import ReferenceStringParsing

    device = "gpu" if torch.cuda.is_available() else "cpu"
    rsp_pipeline = ReferenceStringParsing(os_name="nt")


    def rsp_for_str(input, dehyphen=False) -> List[Tuple[str, str]]:
        results = rsp_pipeline.predict(input, type="str", dehyphen=dehyphen)
        output = []
        for res in results:
            for token, tag in zip(res["tokens"], res["tags"]):
                output.append((token, tag))
            output.append(("\n\n", None))
        return output


    def rsp_for_file(input, dehyphen=False) -> List[Tuple[str, str]]:
        if input == None:
            return None
        filename = input.name
        # Identify the format of input and parse reference strings
        if filename[-4:] == ".txt":
            results = rsp_pipeline.predict(filename, type="txt", dehyphen=dehyphen, save_results=False)
        elif filename[-4:] == ".pdf":
            results = rsp_pipeline.predict(filename, dehyphen=dehyphen, save_results=False)
        else:
            return [("File Format Error !", None)]
        # Prepare for the input gradio.HighlightedText accepts.
        output = []
        for res in results:
            for token, tag in zip(res["tokens"], res["tags"]):
                output.append((token, tag))
            output.append(("\n\n", None))
        return output
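The (token, tag) tuples these helpers return are exactly the format gr.HighlightedText consumes. A hypothetical call (assuming SciAssist and its parsing model are available locally; the exact tag names depend on the model):

```python
from reference_string_parsing import rsp_for_str

ref = ("Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, "
       "and Andrew D. McCallum. 2017. Semeval-2017 task 10 (scienceie): "
       "Extracting keyphrases and relations from scientific publications. "
       "In ACL workshop (SemEval).")

# Each reference yields a run of (token, tag) pairs followed by a
# ("\n\n", None) separator, which gr.HighlightedText renders directly.
for token, tag in rsp_for_str(ref, dehyphen=False):
    print(f"{token!r:>40}  {tag}")
```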
requirements.txt (CHANGED, +2 -2)

Before (the old SciAssist version number is cut off in the extracted diff):

    torch==1.12.0
    SciAssist==0.0.

After:

    torch==1.12.0
    SciAssist==0.0.24
summarization.py (CHANGED, +36 -36)

Old version (several lines are cut mid-statement in the extracted diff and are left as shown; the long ssum_str_example string on the last line is unchanged and identical to the one in controlled_summarization.py above, so it is not repeated here):

    from typing import List, Tuple
    import torch
    from SciAssist import Summarization

    device = "gpu" if torch.cuda.is_available() else "cpu"
    ssum_pipeline = Summarization(os_name="nt")


    def ssum_for_str(input
        results = ssum_pipeline.predict(input, type="str"

        output = []
        for res in results["summary"]:
            output.append(f"{res}\n\n")
        return "".join(output)


    def ssum_for_file(input
        if input == None:
            return None
        filename = input.name
        # Identify the format of input and parse reference strings
        if filename[-4:] == ".txt":
            results = ssum_pipeline.predict(filename, type="txt",

        elif filename[-4:] == ".pdf":
            results = ssum_pipeline.predict(filename,
        else:
            return [("File Format Error !", None)]

        output = []
        for res in results["summary"]:
            output.append(f"{res}\n\n")
        return "".join(output)


    ssum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ..."

New version (the pipeline now loads an explicit checkpoint; the final ssum_str_example line is again unchanged and identical to the one in controlled_summarization.py above):

    from typing import List, Tuple
    import torch
    from SciAssist import Summarization

    device = "gpu" if torch.cuda.is_available() else "cpu"
    ssum_pipeline = Summarization(os_name="nt", checkpoint="google/flan-t5-base")


    def ssum_for_str(input) -> List[Tuple[str, str]]:
        results = ssum_pipeline.predict(input, type="str")

        output = []
        for res in results["summary"]:
            output.append(f"{res}\n\n")
        return "".join(output)


    def ssum_for_file(input) -> List[Tuple[str, str]]:
        if input == None:
            return None
        filename = input.name
        # Identify the format of input and parse reference strings
        if filename[-4:] == ".txt":
            results = ssum_pipeline.predict(filename, type="txt",
                                            save_results=False)
        elif filename[-4:] == ".pdf":
            results = ssum_pipeline.predict(filename, save_results=False)
        else:
            return [("File Format Error !", None)]

        output = []
        for res in results["summary"]:
            output.append(f"{res}\n\n")
        return "".join(output)


    ssum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ..."
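Both modules wrap the same SciAssist.Summarization class; the controlled variant simply forwards length and keywords to predict. A minimal sketch of the underlying calls (assuming the SciAssist package is installed and the checkpoint can be downloaded in your environment):

```python
from SciAssist import Summarization

# Both pipelines in this Space load the same checkpoint.
pipeline = Summarization(os_name="nt", checkpoint="google/flan-t5-base")

# Any plain-text document; examples/BERT_body.txt ships with this Space.
with open("examples/BERT_body.txt") as f:
    text = f.read()

plain = pipeline.predict(text, type="str")                    # summarization.py path
controlled = pipeline.predict(text, type="str",
                              length=100, keywords=["BERT"])  # controlled_summarization.py path

print(plain["summary"])
print(controlled["summary"])
```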