DeDeckerThomas committed on
Commit
e4f39c4
‒
1 Parent(s): a68bc63

Second version

Browse files
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
  title: Keyphrase Extraction
3
- emoji: 🚀
4
- colorFrom: purple
5
- colorTo: green
6
  sdk: streamlit
7
- sdk_version: 1.2.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
1
  ---
2
  title: Keyphrase Extraction
3
+ emoji: 🔑
4
+ colorFrom: yellow
5
+ colorTo: red
6
  sdk: streamlit
7
+ sdk_version: 1.9.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
app.py CHANGED
@@ -1,22 +1,34 @@
1
  import streamlit as st
2
  import pandas as pd
3
- from extraction.keyphrase_extraction_pipeline import KeyphraseExtractionPipeline
4
- from extraction.keyphrase_generation_pipeline import KeyphraseGenerationPipeline
5
  import orjson
6
 
 
 
 
 
 
7
 
8
  if "config" not in st.session_state:
9
  with open("config.json", "r") as f:
10
  content = f.read()
11
  st.session_state.config = orjson.loads(content)
 
 
12
 
13
  st.set_page_config(
14
  page_icon="🔑",
15
  page_title="Keyphrase extraction/generation with Transformers",
16
  layout="wide",
17
- initial_sidebar_state="auto",
18
  )
19
 
 
 
 
 
 
 
20
 
21
  @st.cache(allow_output_mutation=True)
22
  def load_pipeline(chosen_model):
@@ -28,27 +40,117 @@ def load_pipeline(chosen_model):
28
 
29
  def extract_keyphrases():
30
  st.session_state.keyphrases = pipe(st.session_state.input_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
 
33
- st.header("🔑 Keyphrase extraction/generation with Transformers")
34
- col1, col2 = st.columns([2, 3])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- col1.subheader("Select model")
37
  chosen_model = col1.selectbox(
38
  "Choose your model:",
39
  st.session_state.config.get("models"),
40
  )
41
  st.session_state.chosen_model = chosen_model
42
 
43
- pipe = load_pipeline(st.session_state.chosen_model)
 
 
44
 
45
- col2.subheader("Input your text")
46
- st.session_state.input_text = col2.text_area(
47
- "Input", st.session_state.config.get("example_text"), height=150
48
  )
49
- pressed = col2.button("Extract", on_click=extract_keyphrases)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- if pressed:
52
- col2.subheader("🐧 Output")
53
- df = pd.DataFrame(data=st.session_state.keyphrases, columns=["Keyphrases"])
54
- col2.table(df)
 
1
  import streamlit as st
2
  import pandas as pd
3
+ from pipelines.keyphrase_extraction_pipeline import KeyphraseExtractionPipeline
4
+ from pipelines.keyphrase_generation_pipeline import KeyphraseGenerationPipeline
5
  import orjson
6
 
7
+ from annotated_text.util import get_annotated_html
8
+ from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
9
+ import re
10
+ import numpy as np
11
+
12
 
13
  if "config" not in st.session_state:
14
  with open("config.json", "r") as f:
15
  content = f.read()
16
  st.session_state.config = orjson.loads(content)
17
+ st.session_state.data_frame = pd.DataFrame(columns=["model"])
18
+ st.session_state.keyphrases = []
19
 
20
  st.set_page_config(
21
  page_icon="πŸ”‘",
22
  page_title="Keyphrase extraction/generation with Transformers",
23
  layout="wide",
 
24
  )
25
 
26
+ if "select_rows" not in st.session_state:
27
+ st.session_state.selected_rows = []
28
+
29
+ st.header("πŸ”‘ Keyphrase extraction/generation with Transformers")
30
+ col1, col2 = st.empty().columns(2)
31
+
32
 
33
  @st.cache(allow_output_mutation=True)
34
  def load_pipeline(chosen_model):
 
40
 
41
  def extract_keyphrases():
42
  st.session_state.keyphrases = pipe(st.session_state.input_text)
43
+ st.session_state.data_frame = pd.concat(
44
+ [
45
+ st.session_state.data_frame,
46
+ pd.DataFrame(
47
+ data=[
48
+ np.concatenate(
49
+ (
50
+ [
51
+ st.session_state.chosen_model,
52
+ st.session_state.input_text,
53
+ ],
54
+ st.session_state.keyphrases,
55
+ )
56
+ )
57
+ ],
58
+ columns=["model", "text"]
59
+ + [str(i) for i in range(len(st.session_state.keyphrases))],
60
+ ),
61
+ ],
62
+ ignore_index=True,
63
+ axis=0,
64
+ ).fillna("")
65
 
66
 
67
def get_annotated_text(text, keyphrases):
    """Split *text* into plain strings and highlighted keyphrase tuples.

    Args:
        text: The text to annotate.
        keyphrases: Iterable of keyphrases to highlight. Matching is
            case-insensitive and empty entries are ignored.

    Returns:
        A list whose items are either a padded plain-text string or a
        ``(phrase, "KEY", "#21c354")`` tuple for a highlighted keyphrase.
    """
    # Glues the words of a multi-word match together so that the split on
    # spaces below keeps the phrase in one piece.
    # NOTE(review): text that already contains "$K" would be misread as a match.
    marker = "$K"
    phrases = [str(keyphrase) for keyphrase in keyphrases if str(keyphrase)]

    for phrase in phrases:
        text = re.sub(
            # Keyphrases are literal text, not patterns: escaping stops
            # inputs such as "C++" or "(x" from raising re.error.
            re.escape(phrase),
            # A callable replacement keeps the casing found in the text and
            # avoids backslash/group processing of the replacement string.
            lambda match: match.group(0).replace(" ", marker),
            text,
            flags=re.I,
        )

    known = {phrase.casefold() for phrase in phrases}
    words = text.split(" ")
    # Measure the text being annotated (after multi-word phrases were
    # joined), not the global input text, so the final word is found
    # correctly for history rows as well.
    last_index = len(words) - 1

    result = []
    for i, word in enumerate(words):
        bare = re.sub(r"[^\w\s]", "", word)
        if word.casefold() in known or bare.casefold() in known:
            result.append((word, "KEY", "#21c354"))
        elif marker in word:
            result.append((" ".join(word.split(marker)), "KEY", "#21c354"))
        elif i == last_index:
            result.append(f" {word}")
        elif i == 0:
            result.append(f"{word} ")
        else:
            result.append(f" {word} ")
    return result
90
+
91
+
92
def rerender_output(layout):
    """Render the annotated text into *layout*.

    Shows the row selected in the history grid when there is one, otherwise
    the most recent extraction (``st.session_state.input_text`` and
    ``st.session_state.keyphrases``).

    Args:
        layout: Streamlit container providing ``subheader`` and ``markdown``.
    """
    selected_rows = st.session_state.selected_rows
    has_selection = len(selected_rows) > 0

    # Nothing to display. Without this guard the history branch below would
    # index into an empty selection and fail.
    if not has_selection and len(st.session_state.keyphrases) == 0:
        return

    layout.subheader("🐧 Output")

    if has_selection:
        text = selected_rows["text"].values[0]
        # Every column other than "model" and "text" holds a keyphrase; rows
        # with fewer keyphrases are padded with "", which is dropped here.
        keyphrase_columns = selected_rows.columns.difference(["model", "text"])
        first_row = selected_rows.loc[:, keyphrase_columns].astype(str).values.tolist()[0]
        keyphrases = [keyphrase for keyphrase in first_row if keyphrase != ""]
    else:
        text = st.session_state.input_text
        keyphrases = st.session_state.keyphrases

    result = get_annotated_text(text, keyphrases)

    layout.markdown(
        get_annotated_html(*result),
        unsafe_allow_html=True,
    )
122
+
123
 
 
124
  chosen_model = col1.selectbox(
125
  "Choose your model:",
126
  st.session_state.config.get("models"),
127
  )
128
  st.session_state.chosen_model = chosen_model
129
 
130
+ pipe = load_pipeline(
131
+ f"{st.session_state.config.get('model_author')}/{st.session_state.chosen_model}"
132
+ )
133
 
134
+ st.session_state.input_text = col1.text_area(
135
+ "Input", st.session_state.config.get("example_text"), height=300
 
136
  )
137
+ pressed = col1.button("Extract", on_click=extract_keyphrases)
138
+
139
+
140
+ if len(st.session_state.data_frame.columns) > 0:
141
+ st.subheader("πŸ“œ History")
142
+ builder = GridOptionsBuilder.from_dataframe(
143
+ st.session_state.data_frame, sortable=False
144
+ )
145
+ builder.configure_selection(selection_mode="single", use_checkbox=True)
146
+ builder.configure_column("text", hide=True)
147
+ go = builder.build()
148
+ data = AgGrid(
149
+ st.session_state.data_frame,
150
+ gridOptions=go,
151
+ update_mode=GridUpdateMode.SELECTION_CHANGED,
152
+ )
153
+ st.session_state.selected_rows = pd.DataFrame(data["selected_rows"])
154
 
155
+ if len(st.session_state.selected_rows) > 0 or len(st.session_state.keyphrases) > 0:
156
+ rerender_output(col2)
 
 
config.json CHANGED
@@ -1,13 +1,14 @@
1
  {
2
  "example_text": "Keyphrase extraction is a technique in text analysis where you extract the important keyphrases from a text. Since this is a time-consuming process, Artificial Intelligence is used to automate it. Currently, classical machine learning methods, that use statistics and linguistics, are widely used for the extraction process. The fact that these methods have been widely used in the community has the advantage that there are many easy-to-use libraries. Now with the recent innovations in deep learning methods (such as recurrent neural networks and transformers, GANS, ... ), keyphrase extraction can be improved. These new methods also focus on the semantics and context of a document, which is quite an improvement.",
 
3
  "models": [
4
- "DeDeckerThomas/keyphrase-extraction-kbir-inspec",
5
- "DeDeckerThomas/keyphrase-extraction-distilbert-inspec",
6
- "DeDeckerThomas/keyphrase-extraction-distilbert-openkp",
7
- "DeDeckerThomas/keyphrase-extraction-distilbert-kptimes",
8
- "DeDeckerThomas/keyphrase-extraction-kbir-kpcrowd",
9
- "DeDeckerThomas/keyphrase-generation-keybart-inspec",
10
- "DeDeckerThomas/keyphrase-generation-t5-small-inspec",
11
- "DeDeckerThomas/keyphrase-generation-t5-small-openkp"
12
  ]
13
  }
 
1
  {
2
  "example_text": "Keyphrase extraction is a technique in text analysis where you extract the important keyphrases from a text. Since this is a time-consuming process, Artificial Intelligence is used to automate it. Currently, classical machine learning methods, that use statistics and linguistics, are widely used for the extraction process. The fact that these methods have been widely used in the community has the advantage that there are many easy-to-use libraries. Now with the recent innovations in deep learning methods (such as recurrent neural networks and transformers, GANS, ... ), keyphrase extraction can be improved. These new methods also focus on the semantics and context of a document, which is quite an improvement.",
3
+ "model_author": "DeDeckerThomas",
4
  "models": [
5
+ "keyphrase-extraction-kbir-inspec",
6
+ "keyphrase-extraction-distilbert-inspec",
7
+ "keyphrase-extraction-distilbert-openkp",
8
+ "keyphrase-extraction-distilbert-kptimes",
9
+ "keyphrase-extraction-kbir-kpcrowd",
10
+ "keyphrase-generation-keybart-inspec",
11
+ "keyphrase-generation-t5-small-inspec",
12
+ "keyphrase-generation-t5-small-openkp"
13
  ]
14
  }
pipelines/__pycache__/keyphrase_extraction_pipeline.cpython-39.pyc ADDED
Binary file (1.31 kB). View file
 
pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc ADDED
Binary file (1.49 kB). View file
 
pipelines/keyphrase_extraction_pipeline.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import (
2
+ TokenClassificationPipeline,
3
+ AutoModelForTokenClassification,
4
+ AutoTokenizer,
5
+ )
6
+ from transformers.pipelines import AggregationStrategy
7
+ import numpy as np
8
+
9
+
10
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that returns the distinct keyphrases."""

    def __init__(self, model, *args, **kwargs):
        """Load the model and tokenizer identified by *model*.

        Remaining positional and keyword arguments are forwarded unchanged
        to ``TokenClassificationPipeline``.
        """
        token_classifier = AutoModelForTokenClassification.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(
            *args, model=token_classifier, tokenizer=tokenizer, **kwargs
        )

    def postprocess(self, model_outputs):
        """Return the stripped, de-duplicated "word" of every aggregated entity."""
        entities = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        words = [entity.get("word").strip() for entity in entities]
        # np.unique removes duplicates and returns the values sorted.
        return np.unique(words)
pipelines/keyphrase_generation_pipeline.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import (
2
+ Text2TextGenerationPipeline,
3
+ AutoModelForSeq2SeqLM,
4
+ AutoTokenizer,
5
+ )
6
+
7
+
8
class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
    """Text-to-text pipeline that splits the generated text into keyphrases."""

    def __init__(self, model, keyphrase_sep_token=";", *args, **kwargs):
        """Load the model and tokenizer identified by *model*.

        Args:
            model: Identifier passed to ``from_pretrained`` for both the
                seq2seq model and its tokenizer.
            keyphrase_sep_token: Separator between keyphrases in the
                generated text.
            *args, **kwargs: Forwarded to ``Text2TextGenerationPipeline``.
        """
        super().__init__(
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs,
        )
        self.keyphrase_sep_token = keyphrase_sep_token

    def postprocess(self, model_outputs):
        """Return the stripped keyphrases of the first generated sequence.

        Raises:
            IndexError: If the parent pipeline returns no results.
        """
        results = super().postprocess(model_outputs=model_outputs)
        # Only the first result is returned, so only that one is parsed. The
        # debug print of the raw results has been removed.
        generated_text = results[0].get("generated_text")
        return [
            keyphrase.strip()
            for keyphrase in generated_text.split(self.keyphrase_sep_token)
        ]
test.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
2
+ from transformers_interpret import MultiLabelClassificationExplainer
3
+
4
# Scratch script: explain the predictions of an emotion classifier on one
# sample review and print the per-word attributions.
model_name = "j-hartmann/emotion-english-distilroberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

cls_explainer = MultiLabelClassificationExplainer(model, tokenizer)

sample_text = "There were many aspects of the film I liked, but it was frightening and gross in parts. My parents hated it."
word_attributions = cls_explainer(sample_text)
print(word_attributions)