Spaces:
Runtime error
Runtime error
DeDeckerThomas
committed on
Commit
•
e4f39c4
1
Parent(s):
a68bc63
Second version
Browse files- README.md +4 -4
- app.py +117 -15
- config.json +9 -8
- pipelines/__pycache__/keyphrase_extraction_pipeline.cpython-39.pyc +0 -0
- pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc +0 -0
- pipelines/keyphrase_extraction_pipeline.py +24 -0
- pipelines/keyphrase_generation_pipeline.py +29 -0
- test.py +13 -0
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
---
|
2 |
title: Keyphrase Extraction
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
-
sdk_version: 1.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
|
|
1 |
---
|
2 |
title: Keyphrase Extraction
|
3 |
+
emoji: π
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: red
|
6 |
sdk: streamlit
|
7 |
+
sdk_version: 1.9.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
app.py
CHANGED
@@ -1,22 +1,34 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
-
from
|
4 |
-
from
|
5 |
import orjson
|
6 |
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
if "config" not in st.session_state:
|
9 |
with open("config.json", "r") as f:
|
10 |
content = f.read()
|
11 |
st.session_state.config = orjson.loads(content)
|
|
|
|
|
12 |
|
13 |
st.set_page_config(
|
14 |
page_icon="π",
|
15 |
page_title="Keyphrase extraction/generation with Transformers",
|
16 |
layout="wide",
|
17 |
-
initial_sidebar_state="auto",
|
18 |
)
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
@st.cache(allow_output_mutation=True)
|
22 |
def load_pipeline(chosen_model):
|
@@ -28,27 +40,117 @@ def load_pipeline(chosen_model):
|
|
28 |
|
29 |
def extract_keyphrases():
|
30 |
st.session_state.keyphrases = pipe(st.session_state.input_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
col1.subheader("Select model")
|
37 |
chosen_model = col1.selectbox(
|
38 |
"Choose your model:",
|
39 |
st.session_state.config.get("models"),
|
40 |
)
|
41 |
st.session_state.chosen_model = chosen_model
|
42 |
|
43 |
-
pipe = load_pipeline(
|
|
|
|
|
44 |
|
45 |
-
|
46 |
-
st.session_state.
|
47 |
-
"Input", st.session_state.config.get("example_text"), height=150
|
48 |
)
|
49 |
-
pressed =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
-
if
|
52 |
-
col2
|
53 |
-
df = pd.DataFrame(data=st.session_state.keyphrases, columns=["Keyphrases"])
|
54 |
-
col2.table(df)
|
|
|
1 |
import streamlit as st
import pandas as pd
from pipelines.keyphrase_extraction_pipeline import KeyphraseExtractionPipeline
from pipelines.keyphrase_generation_pipeline import KeyphraseGenerationPipeline
import orjson

from annotated_text.util import get_annotated_html
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
import re
import numpy as np

# One-time session initialisation: load the app config and create the
# (initially empty) extraction history.  Guarded on "config" so a rerun of
# the script does not wipe the accumulated history.
if "config" not in st.session_state:
    with open("config.json", "r") as f:
        content = f.read()
    st.session_state.config = orjson.loads(content)
    st.session_state.data_frame = pd.DataFrame(columns=["model"])
    st.session_state.keyphrases = []

st.set_page_config(
    # NOTE(review): "π" looks like a mojibake'd emoji from extraction —
    # confirm the original glyph before relying on it.
    page_icon="π",
    page_title="Keyphrase extraction/generation with Transformers",
    layout="wide",
)

# Initialise the grid-selection holder exactly once per session.
# BUG FIX: the guard previously tested the key "select_rows" (never set),
# so "selected_rows" was reset to [] on every rerun; test the key that is
# actually assigned.
if "selected_rows" not in st.session_state:
    st.session_state.selected_rows = []

# NOTE(review): "π" below also looks like a mojibake'd emoji — confirm.
st.header("π Keyphrase extraction/generation with Transformers")
col1, col2 = st.empty().columns(2)
|
31 |
+
|
32 |
|
33 |
@st.cache(allow_output_mutation=True)
|
34 |
def load_pipeline(chosen_model):
|
|
|
40 |
|
41 |
def extract_keyphrases():
    """Run the active pipeline on the input text and append one history row.

    The row holds the model name, the input text, and one string column per
    extracted keyphrase; missing cells in older rows are filled with "".
    """
    state = st.session_state
    state.keyphrases = pipe(state.input_text)

    # One flat row: model name, input text, then every keyphrase in order.
    row_values = np.concatenate(
        (
            [state.chosen_model, state.input_text],
            state.keyphrases,
        )
    )
    keyphrase_columns = [str(i) for i in range(len(state.keyphrases))]
    new_row = pd.DataFrame(
        data=[row_values],
        columns=["model", "text"] + keyphrase_columns,
    )

    state.data_frame = pd.concat(
        [state.data_frame, new_row],
        ignore_index=True,
        axis=0,
    ).fillna("")
|
65 |
|
66 |
|
67 |
+
def get_annotated_text(text, keyphrases):
    """Split *text* into chunks suitable for annotated-HTML rendering.

    Multi-word keyphrases are first fused with a "$K" marker (so they survive
    the whitespace split as one token), then every token that matches a
    keyphrase is emitted as a ("phrase", "KEY", "#21c354") annotation tuple;
    all other tokens are emitted as plain strings padded with spaces.

    Args:
        text: the document to annotate.
        keyphrases: iterable of keyphrase strings to highlight.

    Returns:
        list of plain strings and (word, label, color) tuples.
    """
    for keyphrase in keyphrases:
        # BUG FIX: escape the keyphrase — an unescaped phrase such as "C++"
        # is an invalid pattern and raised re.error.
        text = re.sub(
            f"({re.escape(keyphrase)})",
            keyphrase.replace(" ", "$K"),
            text,
            flags=re.I,
        )

    result = []
    words = text.split(" ")
    for i, word in enumerate(words):
        # Single-word keyphrase: compare with punctuation stripped.
        if re.sub(r"[^\w\s]", "", word) in keyphrases:
            result.append((word, "KEY", "#21c354"))
        # Fused multi-word keyphrase: restore the spaces.
        elif "$K" in word:
            result.append((" ".join(word.split("$K")), "KEY", "#21c354"))
        else:
            # BUG FIX: pad relative to *this* text, not
            # st.session_state.input_text — the function is also called with
            # historical texts from the selection grid, whose length differs.
            if i == len(words) - 1:
                result.append(f" {word}")
            elif i == 0:
                result.append(f"{word} ")
            else:
                result.append(f" {word} ")
    return result
|
90 |
+
|
91 |
+
|
92 |
+
def rerender_output(layout):
    """Render the annotated keyphrase output into *layout* (a Streamlit container).

    Shows the most recent extraction when no history row is selected;
    otherwise shows the selected history row's text and keyphrases.
    """
    # NOTE(review): "π§" looks like a mojibake'd emoji from extraction — confirm glyph.
    layout.subheader("π§ Output")
    if (
        len(st.session_state.keyphrases) > 0
        and len(st.session_state.selected_rows) == 0
    ):
        # Fresh extraction, nothing selected in the grid: show current input.
        text, keyphrases = st.session_state.input_text, st.session_state.keyphrases
    else:
        # A history row is selected. selected_rows is a one-row DataFrame
        # (built from the AgGrid selection); pull its text plus every
        # non-empty keyphrase cell, i.e. all columns except "model"/"text".
        text, keyphrases = (
            st.session_state.selected_rows["text"].values[0],
            [
                keyphrase
                for keyphrase in st.session_state.selected_rows.loc[
                    :,
                    st.session_state.selected_rows.columns.difference(
                        ["model", "text"]
                    ),
                ]
                .astype(str)
                .values.tolist()[0]
                # History rows are ""-padded to a common width; drop the padding.
                if keyphrase != ""
            ],
        )

    result = get_annotated_text(text, keyphrases)

    layout.markdown(
        get_annotated_html(*result),
        unsafe_allow_html=True,
    )
|
122 |
+
|
123 |
|
|
|
124 |
# --- Model selection, input, and extraction trigger (left column) ---
chosen_model = col1.selectbox(
    "Choose your model:",
    st.session_state.config.get("models"),
)
st.session_state.chosen_model = chosen_model

# load_pipeline is wrapped in @st.cache, so each model is loaded only once.
pipe = load_pipeline(
    f"{st.session_state.config.get('model_author')}/{st.session_state.chosen_model}"
)

st.session_state.input_text = col1.text_area(
    "Input", st.session_state.config.get("example_text"), height=300
)
pressed = col1.button("Extract", on_click=extract_keyphrases)


# --- History grid ---
# NOTE(review): data_frame is created with a "model" column at session init,
# so this condition appears to be always true — confirm intent.
if len(st.session_state.data_frame.columns) > 0:
    # NOTE(review): "π" looks like a mojibake'd emoji — confirm glyph.
    st.subheader("π History")
    builder = GridOptionsBuilder.from_dataframe(
        st.session_state.data_frame, sortable=False
    )
    # Single-row checkbox selection; raw input text kept in the frame but hidden.
    builder.configure_selection(selection_mode="single", use_checkbox=True)
    builder.configure_column("text", hide=True)
    go = builder.build()
    data = AgGrid(
        st.session_state.data_frame,
        gridOptions=go,
        update_mode=GridUpdateMode.SELECTION_CHANGED,
    )
    st.session_state.selected_rows = pd.DataFrame(data["selected_rows"])

# Re-render the annotated output whenever a row is selected or keyphrases exist.
if len(st.session_state.selected_rows) > 0 or len(st.session_state.keyphrases) > 0:
    rerender_output(col2)
|
|
|
|
config.json
CHANGED
@@ -1,13 +1,14 @@
|
|
1 |
{
|
2 |
"example_text": "Keyphrase extraction is a technique in text analysis where you extract the important keyphrases from a text. Since this is a time-consuming process, Artificial Intelligence is used to automate it. Currently, classical machine learning methods, that use statistics and linguistics, are widely used for the extraction process. The fact that these methods have been widely used in the community has the advantage that there are many easy-to-use libraries. Now with the recent innovations in deep learning methods (such as recurrent neural networks and transformers, GANS, ... ), keyphrase extraction can be improved. These new methods also focus on the semantics and context of a document, which is quite an improvement.",
|
|
|
3 |
"models": [
|
4 |
-
"
|
5 |
-
"
|
6 |
-
"
|
7 |
-
"
|
8 |
-
"
|
9 |
-
"
|
10 |
-
"
|
11 |
-
"
|
12 |
]
|
13 |
}
|
|
|
1 |
{
|
2 |
"example_text": "Keyphrase extraction is a technique in text analysis where you extract the important keyphrases from a text. Since this is a time-consuming process, Artificial Intelligence is used to automate it. Currently, classical machine learning methods, that use statistics and linguistics, are widely used for the extraction process. The fact that these methods have been widely used in the community has the advantage that there are many easy-to-use libraries. Now with the recent innovations in deep learning methods (such as recurrent neural networks and transformers, GANS, ... ), keyphrase extraction can be improved. These new methods also focus on the semantics and context of a document, which is quite an improvement.",
|
3 |
+
"model_author": "DeDeckerThomas",
|
4 |
"models": [
|
5 |
+
"keyphrase-extraction-kbir-inspec",
|
6 |
+
"keyphrase-extraction-distilbert-inspec",
|
7 |
+
"keyphrase-extraction-distilbert-openkp",
|
8 |
+
"keyphrase-extraction-distilbert-kptimes",
|
9 |
+
"keyphrase-extraction-kbir-kpcrowd",
|
10 |
+
"keyphrase-generation-keybart-inspec",
|
11 |
+
"keyphrase-generation-t5-small-inspec",
|
12 |
+
"keyphrase-generation-t5-small-openkp"
|
13 |
]
|
14 |
}
|
pipelines/__pycache__/keyphrase_extraction_pipeline.cpython-39.pyc
ADDED
Binary file (1.31 kB). View file
|
|
pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc
ADDED
Binary file (1.49 kB). View file
|
|
pipelines/keyphrase_extraction_pipeline.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import (
|
2 |
+
TokenClassificationPipeline,
|
3 |
+
AutoModelForTokenClassification,
|
4 |
+
AutoTokenizer,
|
5 |
+
)
|
6 |
+
from transformers.pipelines import AggregationStrategy
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
|
10 |
+
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
|
11 |
+
def __init__(self, model, *args, **kwargs):
|
12 |
+
super().__init__(
|
13 |
+
model=AutoModelForTokenClassification.from_pretrained(model),
|
14 |
+
tokenizer=AutoTokenizer.from_pretrained(model),
|
15 |
+
*args,
|
16 |
+
**kwargs
|
17 |
+
)
|
18 |
+
|
19 |
+
def postprocess(self, model_outputs):
|
20 |
+
results = super().postprocess(
|
21 |
+
model_outputs=model_outputs,
|
22 |
+
aggregation_strategy=AggregationStrategy.SIMPLE,
|
23 |
+
)
|
24 |
+
return np.unique([result.get("word").strip() for result in results])
|
pipelines/keyphrase_generation_pipeline.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import (
|
2 |
+
Text2TextGenerationPipeline,
|
3 |
+
AutoModelForSeq2SeqLM,
|
4 |
+
AutoTokenizer,
|
5 |
+
)
|
6 |
+
|
7 |
+
|
8 |
+
class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
|
9 |
+
def __init__(self, model, keyphrase_sep_token=";", *args, **kwargs):
|
10 |
+
super().__init__(
|
11 |
+
model=AutoModelForSeq2SeqLM.from_pretrained(model),
|
12 |
+
tokenizer=AutoTokenizer.from_pretrained(model),
|
13 |
+
*args,
|
14 |
+
**kwargs
|
15 |
+
)
|
16 |
+
self.keyphrase_sep_token = keyphrase_sep_token
|
17 |
+
|
18 |
+
def postprocess(self, model_outputs):
|
19 |
+
results = super().postprocess(model_outputs=model_outputs)
|
20 |
+
print(results)
|
21 |
+
return [
|
22 |
+
[
|
23 |
+
keyphrase.strip()
|
24 |
+
for keyphrase in result.get("generated_text").split(
|
25 |
+
self.keyphrase_sep_token
|
26 |
+
)
|
27 |
+
]
|
28 |
+
for result in results
|
29 |
+
][0]
|
test.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Scratch script: loads a multi-label emotion classifier, builds a
# transformers-interpret explainer around it, and prints per-word
# attributions for one sample sentence.
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers_interpret import MultiLabelClassificationExplainer

# Emotion classifier checkpoint (DistilRoBERTa fine-tuned for English emotions).
model_name = "j-hartmann/emotion-english-distilroberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


cls_explainer = MultiLabelClassificationExplainer(model, tokenizer)


# Attribution scores for every token, per label.
word_attributions = cls_explainer("There were many aspects of the film I liked, but it was frightening and gross in parts. My parents hated it.")
print(word_attributions)
|