Commit 1a0701e
1 Parent(s): ee6c452

precomputation (#19)

- Add precomputing acl data (44257a91d79656a35a3eac347f3d60c5c66a1ef9)
- Add precomputing acl data (02d7398f2f7c165b2d5110ff7580990b0a4f19f5)

Co-authored-by: Yixi Ding <dyxohjl666@users.noreply.huggingface.co>

Files changed:
- app.py                        +37 -12
- controlled_summarization.py   +87 -16
- requirements.txt               +2  -1
app.py
CHANGED
@@ -5,12 +5,13 @@ from reference_string_parsing import *
 from controlled_summarization import *
 from dataset_extraction import *

+from controlled_summarization import recommended_kw
 import requests

 # Example Usage
-#url = "https://arxiv.org/pdf/2305.14996.pdf"
-#dest_folder = "./examples/"
-#download_pdf(url, dest_folder)
+# url = "https://arxiv.org/pdf/2305.14996.pdf"
+# dest_folder = "./examples/"
+# download_pdf(url, dest_folder)


 with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
@@ -31,17 +32,20 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
         gr.Markdown("* Set the length of text used for summarization. Length 0 will exert no control over length.")
         # ctrlsum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
         # ctrlsum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
-        ctrlsum_file_length = gr.
-
+        ctrlsum_file_length = gr.Radio(label="Length", value=0, choices=[0, 50, 100, 200])
+        kw = gr.Radio(visible=False)
+        ctrlsum_file_keywords = gr.Textbox(label="Keywords", max_lines=1)
         with gr.Row():
             ctrlsum_file_btn = gr.Button("Generate")
             ctrlsum_file_output = gr.Textbox(
                 elem_id="htext",
                 label="Summary",
             )
-        ctrlsum_file_examples = gr.Examples(
-
-
+        ctrlsum_file_examples = gr.Examples(
+            examples=[["examples/H01-1042_body.txt", 50, "automatic evaluation technique", "", ""],
+                      ["examples/H01-1042.pdf", 0, "automatic evaluation technique", "", ""]],
+            inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str, ctrlsum_url
+                    ])



@@ -51,13 +55,34 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
             outputs=[ctrlsum_file_output, ctrlsum_str, ctrlsum_file]
         )
         def clear():
-            return None,0,None, None
+            return None, 0, None, None, gr.Radio(visible=False)
+
+
+        def update_url(url):
+            if url in recommended_kw.keys():
+                keywords = recommended_kw[url]
+                if keywords != None:
+                    return None, None, gr.Radio(choices=keywords[:3], label="Recommended Keywords", visible=True,
+                                                interactive=True)

+            return None, None, gr.Radio(visible=False)
+
+
+        ctrlsum_file.upload(clear, inputs=None,
+                            outputs=[ctrlsum_str, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_url, kw])
+        ctrlsum_url.input(update_url, inputs=ctrlsum_url, outputs=[ctrlsum_str, ctrlsum_file, kw])

-        ctrlsum_file.upload(clear, inputs=None,outputs=[ctrlsum_str,ctrlsum_file_length,ctrlsum_file_keywords, ctrlsum_url])
-        ctrlsum_url.input(clear, inputs=None, outputs=[ctrlsum_str, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_file])
         ctrlsum_str.input(clear, inputs=None,
-                          outputs=[ctrlsum_url, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_file])
+                          outputs=[ctrlsum_url, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_file, kw])
+
+
+
+        def select_kw(env: gr.SelectData):
+            return env.value
+
+
+        kw.select(select_kw, None, ctrlsum_file_keywords)
+
     # Reference String Parsing
     with gr.TabItem("Reference String Parsing"):
         gr.Markdown(rsp_title_md)
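For readers skimming the diff, the new keyword-recommendation wiring in app.py comes down to two pieces: a hidden gr.Radio (kw) that update_url fills with up to three recommended keywords when a known URL is typed, and a select_kw handler that copies the clicked choice into the keywords textbox. The sketch below shows that pattern in isolation; the recommended_kw contents and the example URL are placeholders, not the real precomputed data, and it assumes Gradio 4-style component updates (returning a component instance from a callback), as the diff itself does.

import gradio as gr

# Stand-in for the precomputed mapping imported from controlled_summarization
# in the real app; keys are PDF URLs, values are candidate keywords (placeholder data).
recommended_kw = {
    "https://example.org/H01-1042.pdf": ["automatic evaluation technique", "summarization", "evaluation"],
}

def update_url(url):
    # Show up to three recommended keywords when the URL is known, else hide the radio.
    keywords = recommended_kw.get(url.strip())
    if keywords:
        return gr.Radio(choices=keywords[:3], label="Recommended Keywords",
                        visible=True, interactive=True)
    return gr.Radio(visible=False)

def select_kw(evt: gr.SelectData):
    # Copy the clicked radio choice into the keywords textbox.
    return evt.value

with gr.Blocks() as demo:
    url_box = gr.Textbox(label="URL")
    kw = gr.Radio(visible=False)            # hidden until a known URL is entered
    keywords_box = gr.Textbox(label="Keywords", max_lines=1)

    url_box.input(update_url, inputs=url_box, outputs=kw)
    kw.select(select_kw, None, keywords_box)

if __name__ == "__main__":
    demo.launch()

Typing the placeholder URL into the URL box makes the radio appear with its three suggestions, and clicking one fills the Keywords textbox, which mirrors how the commit feeds precomputed ACL keywords into the summarization inputs.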
controlled_summarization.py
CHANGED
@@ -3,9 +3,64 @@ import torch
 from SciAssist import Summarization
 import os
 import requests
+from datasets import load_dataset
+
+acl_data = load_dataset("dyxohjl666/CocoScisum_ACL", revision="refs/convert/parquet")
 device = "gpu" if torch.cuda.is_available() else "cpu"

-ctrlsum_pipeline = Summarization(os_name="nt",
+ctrlsum_pipeline = Summarization(os_name="nt",device=device)
+
+acl_dict = {}
+recommended_kw = {}
+
+
+def convert_to_dict(data):
+    """ Dict:
+        { url:
+            {length:
+                {keywords: summary};
+             raw_text:
+                str;
+            }
+        }
+
+    """
+    url = data["url"]
+    text = data["text"]
+    keywords = data["keywords"]
+    length = data["length"]
+    summary = data["summary"]
+    for u, t, k, l, s in zip(url, text, keywords, length, summary):
+        if len(u) < 5:
+            continue
+        u = u + ".pdf"
+        if k == None:
+            k = ""
+        if l == None:
+            l = ""
+        k = str(k).strip()
+        l = str(l).strip()
+        if u in acl_dict.keys():
+            if k in acl_dict[u][l].keys():
+                continue
+            else:
+                acl_dict[u][l][k] = s
+        else:
+            acl_dict[u] = {"": {}, "50": {}, "100": {}, "200": {}, "raw_text": t}
+
+        # kws
+        if u in recommended_kw.keys():
+            if k == "" or k in recommended_kw[u]:
+                continue
+            else:
+                recommended_kw[u].append(k)
+        else:
+            recommended_kw[u] = []
+    return 1
+
+
+for i in acl_data.keys():
+    signal = convert_to_dict(acl_data[i])


 def download_pdf(url, dest_folder):
@@ -30,16 +85,15 @@ def download_pdf(url, dest_folder):
     return filename


-def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:
-
+def ctrlsum_for_str(input, length=None, keywords=None) -> List[Tuple[str, str]]:
     if keywords is not None:
         keywords = keywords.strip().split(",")
         if keywords[0] == "":
             keywords = None
-    if length==0 or length is None:
+    if length == 0 or length is None:
         length = None
     results = ctrlsum_pipeline.predict(input, type="str",
-
+                                       length=length, keywords=keywords, num_beams=1)

     output = []
     for res in results["summary"]:
@@ -49,31 +103,49 @@ def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:

 def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> List[Tuple[str, str, str]]:
     if input == None and url == "":
-        if text=="":
-            return None,"Input cannot be left blank.",None
+        if text == "":
+            return None, "Input cannot be left blank.", None
         else:
-            return ctrlsum_for_str(text,length,keywords),text, None
+            return ctrlsum_for_str(text, length, keywords), text, None
     else:
-        filename=""
+        filename = ""
+        url = url.strip()
         if url != "":
-            if len(url) > 4:
+            if len(url) > 4 and url[-3:] == "pdf":
+                if url.strip() in acl_dict.keys():
+                    raw_text = acl_dict[url]["raw_text"]
+                    l = str(length)
+                    if length == 0:
+                        l = ""
+                    if l in acl_dict[url].keys():
+                        if keywords.strip() in acl_dict[url][l].keys():
+                            summary = acl_dict[url][l][keywords]
+                            return summary, raw_text, None
+                    if keywords.strip() == "":
+                        keywords = None
+                    if l == "":
+                        l = None
+                    return ctrlsum_for_str(raw_text, l, keywords), raw_text, None
+
                 filename = download_pdf(url, './cache/')
+            else:
+                "Invalid url(Not PDF)!", None, None
         else:
             filename = input.name
         if keywords != "":
            keywords = keywords.strip().split(",")
            if keywords[0] == "":
                keywords = None
-        if length==0:
+        if length == 0:
             length = None
         # Identify the format of input and parse reference strings
         if filename[-4:] == ".txt":
             results = ctrlsum_pipeline.predict(filename, type="txt",
-
-
+                                               save_results=False,
+                                               length=length, keywords=keywords, num_beams=1)
         elif filename[-4:] == ".pdf":
             results = ctrlsum_pipeline.predict(filename,
-
+                                               save_results=False, length=length, keywords=keywords, num_beams=1)
         else:
             return "File Format Error !", None, filename

@@ -83,5 +155,4 @@ def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> L
     return "".join(output), results["raw_text"], filename


-
-ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . • BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
+ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . • BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
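The heart of this commit is the precomputed lookup table that convert_to_dict builds from the dyxohjl666/CocoScisum_ACL dataset: acl_dict[url][length][keywords] -> summary, with one raw_text entry per URL, while recommended_kw collects the keywords seen for each URL. ctrlsum_for_file then checks that table before ever running the model. The sketch below illustrates that lookup-then-fallback flow under those assumptions; the table contents and the stubbed summarizer are made up for illustration and stand in for the real dataset and the SciAssist pipeline call.

from typing import Optional, Tuple

# Toy version of the precomputed table built by convert_to_dict():
#   acl_dict[url][length][keywords] -> summary, plus one "raw_text" per URL.
# Lengths are stored as strings ("", "50", "100", "200"), as in the commit.
acl_dict = {
    "https://example.org/H01-1042.pdf": {
        "": {"": "A generic summary of the paper."},
        "50": {"automatic evaluation technique": "A short summary focused on evaluation."},
        "100": {}, "200": {},
        "raw_text": "Full body text of the paper ...",
    }
}

def summarize_with_model(text: str, length, keywords) -> str:
    # Placeholder for ctrlsum_pipeline.predict(...); the real app calls SciAssist here.
    return f"<model summary, length={length}, keywords={keywords}>"

def summarize(url: str, length: int = 0, keywords: str = "") -> Tuple[str, Optional[str]]:
    """Return (summary, raw_text), using the precomputed table when possible."""
    url = url.strip()
    entry = acl_dict.get(url)
    if entry is None:
        # Unknown URL: behave as before precomputation (model only).
        return summarize_with_model(url, length or None, keywords or None), None

    raw_text = entry["raw_text"]
    l = "" if length == 0 else str(length)
    cached = entry.get(l, {}).get(keywords.strip())
    if cached is not None:
        # Precomputed hit: no model call needed.
        return cached, raw_text
    # Cache miss: fall back to running the model on the stored raw text.
    return summarize_with_model(raw_text, l or None, keywords.strip() or None), raw_text

if __name__ == "__main__":
    print(summarize("https://example.org/H01-1042.pdf", 50, "automatic evaluation technique"))

The design point is that summaries for the settings exposed in the UI (lengths 0/50/100/200 and the dataset's keywords) are served directly from the table, so the GPU-backed pipeline only runs for inputs outside the precomputed grid.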
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
 pip==23.2.1
 torch==1.12.0
-SciAssist==0.
+SciAssist==0.1.3
 nltk~=3.7
+pytest