Spaces:
Runtime error
Runtime error
DeDeckerThomas
committed on
Commit
•
e4f39c4
1
Parent(s):
a68bc63
Second version
Browse files- README.md +4 -4
- app.py +117 -15
- config.json +9 -8
- pipelines/__pycache__/keyphrase_extraction_pipeline.cpython-39.pyc +0 -0
- pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc +0 -0
- pipelines/keyphrase_extraction_pipeline.py +24 -0
- pipelines/keyphrase_generation_pipeline.py +29 -0
- test.py +13 -0
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
---
|
2 |
title: Keyphrase Extraction
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
-
sdk_version: 1.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
|
|
1 |
---
|
2 |
title: Keyphrase Extraction
|
3 |
+
emoji: π
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: red
|
6 |
sdk: streamlit
|
7 |
+
sdk_version: 1.9.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
app.py
CHANGED
@@ -1,22 +1,34 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
-
from
|
4 |
-
from
|
5 |
import orjson
|
6 |
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
if "config" not in st.session_state:
|
9 |
with open("config.json", "r") as f:
|
10 |
content = f.read()
|
11 |
st.session_state.config = orjson.loads(content)
|
|
|
|
|
12 |
|
13 |
st.set_page_config(
|
14 |
page_icon="π",
|
15 |
page_title="Keyphrase extraction/generation with Transformers",
|
16 |
layout="wide",
|
17 |
-
initial_sidebar_state="auto",
|
18 |
)
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
@st.cache(allow_output_mutation=True)
|
22 |
def load_pipeline(chosen_model):
|
@@ -28,27 +40,117 @@ def load_pipeline(chosen_model):
|
|
28 |
|
29 |
def extract_keyphrases():
|
30 |
st.session_state.keyphrases = pipe(st.session_state.input_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
col1.subheader("Select model")
|
37 |
chosen_model = col1.selectbox(
|
38 |
"Choose your model:",
|
39 |
st.session_state.config.get("models"),
|
40 |
)
|
41 |
st.session_state.chosen_model = chosen_model
|
42 |
|
43 |
-
pipe = load_pipeline(
|
|
|
|
|
44 |
|
45 |
-
|
46 |
-
st.session_state.
|
47 |
-
"Input", st.session_state.config.get("example_text"), height=150
|
48 |
)
|
49 |
-
pressed =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
-
if
|
52 |
-
col2
|
53 |
-
df = pd.DataFrame(data=st.session_state.keyphrases, columns=["Keyphrases"])
|
54 |
-
col2.table(df)
|
|
|
1 |
import streamlit as st
import pandas as pd
from pipelines.keyphrase_extraction_pipeline import KeyphraseExtractionPipeline
from pipelines.keyphrase_generation_pipeline import KeyphraseGenerationPipeline
import orjson

from annotated_text.util import get_annotated_html
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
import re
import numpy as np

# One-time session initialisation: load the app config and create the
# (initially empty) extraction history.  Guarded on "config" so a rerun of
# the script does not wipe the accumulated history.
if "config" not in st.session_state:
    with open("config.json", "r") as f:
        content = f.read()
    st.session_state.config = orjson.loads(content)
    st.session_state.data_frame = pd.DataFrame(columns=["model"])
    st.session_state.keyphrases = []

st.set_page_config(
    # NOTE(review): "π" looks like a mojibake'd emoji from extraction —
    # confirm the original glyph before relying on it.
    page_icon="π",
    page_title="Keyphrase extraction/generation with Transformers",
    layout="wide",
)

# Initialise the grid-selection holder exactly once per session.
# BUG FIX: the guard previously tested the key "select_rows" (never set),
# so "selected_rows" was reset to [] on every rerun; test the key that is
# actually assigned.
if "selected_rows" not in st.session_state:
    st.session_state.selected_rows = []

# NOTE(review): "π" below also looks like a mojibake'd emoji — confirm.
st.header("π Keyphrase extraction/generation with Transformers")
col1, col2 = st.empty().columns(2)
|
31 |
+
|
32 |
|
33 |
@st.cache(allow_output_mutation=True)
|
34 |
def load_pipeline(chosen_model):
|
|
|
40 |
|
41 |
def extract_keyphrases():
    """Run the active pipeline on the input text and append one history row.

    The row holds the model name, the input text, and one string column per
    extracted keyphrase; missing cells in older rows are filled with "".
    """
    state = st.session_state
    state.keyphrases = pipe(state.input_text)

    # One flat row: model name, input text, then every keyphrase in order.
    row_values = np.concatenate(
        (
            [state.chosen_model, state.input_text],
            state.keyphrases,
        )
    )
    keyphrase_columns = [str(i) for i in range(len(state.keyphrases))]
    new_row = pd.DataFrame(
        data=[row_values],
        columns=["model", "text"] + keyphrase_columns,
    )

    state.data_frame = pd.concat(
        [state.data_frame, new_row],
        ignore_index=True,
        axis=0,
    ).fillna("")
|
65 |
|
66 |
|
67 |
+
def get_annotated_text(text, keyphrases):
    """Split *text* into chunks suitable for annotated-HTML rendering.

    Multi-word keyphrases are first fused with a "$K" marker (so they survive
    the whitespace split as one token), then every token that matches a
    keyphrase is emitted as a ("phrase", "KEY", "#21c354") annotation tuple;
    all other tokens are emitted as plain strings padded with spaces.

    Args:
        text: the document to annotate.
        keyphrases: iterable of keyphrase strings to highlight.

    Returns:
        list of plain strings and (word, label, color) tuples.
    """
    for keyphrase in keyphrases:
        # BUG FIX: escape the keyphrase — an unescaped phrase such as "C++"
        # is an invalid pattern and raised re.error.
        text = re.sub(
            f"({re.escape(keyphrase)})",
            keyphrase.replace(" ", "$K"),
            text,
            flags=re.I,
        )

    result = []
    words = text.split(" ")
    for i, word in enumerate(words):
        # Single-word keyphrase: compare with punctuation stripped.
        if re.sub(r"[^\w\s]", "", word) in keyphrases:
            result.append((word, "KEY", "#21c354"))
        # Fused multi-word keyphrase: restore the spaces.
        elif "$K" in word:
            result.append((" ".join(word.split("$K")), "KEY", "#21c354"))
        else:
            # BUG FIX: pad relative to *this* text, not
            # st.session_state.input_text — the function is also called with
            # historical texts from the selection grid, whose length differs.
            if i == len(words) - 1:
                result.append(f" {word}")
            elif i == 0:
                result.append(f"{word} ")
            else:
                result.append(f" {word} ")
    return result
|
90 |
+
|
91 |
+
|
92 |
+
def rerender_output(layout):
    """Render the annotated keyphrase output into *layout* (a Streamlit container).

    Shows the most recent extraction when no history row is selected;
    otherwise shows the selected history row's text and keyphrases.
    """
    # NOTE(review): "π§" looks like a mojibake'd emoji from extraction — confirm glyph.
    layout.subheader("π§ Output")
    if (
        len(st.session_state.keyphrases) > 0
        and len(st.session_state.selected_rows) == 0
    ):
        # Fresh extraction, nothing selected in the grid: show current input.
        text, keyphrases = st.session_state.input_text, st.session_state.keyphrases
    else:
        # A history row is selected. selected_rows is a one-row DataFrame
        # (built from the AgGrid selection); pull its text plus every
        # non-empty keyphrase cell, i.e. all columns except "model"/"text".
        text, keyphrases = (
            st.session_state.selected_rows["text"].values[0],
            [
                keyphrase
                for keyphrase in st.session_state.selected_rows.loc[
                    :,
                    st.session_state.selected_rows.columns.difference(
                        ["model", "text"]
                    ),
                ]
                .astype(str)
                .values.tolist()[0]
                # History rows are ""-padded to a common width; drop the padding.
                if keyphrase != ""
            ],
        )

    result = get_annotated_text(text, keyphrases)

    layout.markdown(
        get_annotated_html(*result),
        unsafe_allow_html=True,
    )
|
122 |
+
|
123 |
|
|
|
124 |
# --- Model selection, input, and extraction trigger (left column) ---
chosen_model = col1.selectbox(
    "Choose your model:",
    st.session_state.config.get("models"),
)
st.session_state.chosen_model = chosen_model

# load_pipeline is wrapped in @st.cache, so each model is loaded only once.
pipe = load_pipeline(
    f"{st.session_state.config.get('model_author')}/{st.session_state.chosen_model}"
)

st.session_state.input_text = col1.text_area(
    "Input", st.session_state.config.get("example_text"), height=300
)
pressed = col1.button("Extract", on_click=extract_keyphrases)


# --- History grid ---
# NOTE(review): data_frame is created with a "model" column at session init,
# so this condition appears to be always true — confirm intent.
if len(st.session_state.data_frame.columns) > 0:
    # NOTE(review): "π" looks like a mojibake'd emoji — confirm glyph.
    st.subheader("π History")
    builder = GridOptionsBuilder.from_dataframe(
        st.session_state.data_frame, sortable=False
    )
    # Single-row checkbox selection; raw input text kept in the frame but hidden.
    builder.configure_selection(selection_mode="single", use_checkbox=True)
    builder.configure_column("text", hide=True)
    go = builder.build()
    data = AgGrid(
        st.session_state.data_frame,
        gridOptions=go,
        update_mode=GridUpdateMode.SELECTION_CHANGED,
    )
    st.session_state.selected_rows = pd.DataFrame(data["selected_rows"])

# Re-render the annotated output whenever a row is selected or keyphrases exist.
if len(st.session_state.selected_rows) > 0 or len(st.session_state.keyphrases) > 0:
    rerender_output(col2)
|
|
|
|
config.json
CHANGED
@@ -1,13 +1,14 @@
|
|
1 |
{
|
2 |
"example_text": "Keyphrase extraction is a technique in text analysis where you extract the important keyphrases from a text. Since this is a time-consuming process, Artificial Intelligence is used to automate it. Currently, classical machine learning methods, that use statistics and linguistics, are widely used for the extraction process. The fact that these methods have been widely used in the community has the advantage that there are many easy-to-use libraries. Now with the recent innovations in deep learning methods (such as recurrent neural networks and transformers, GANS, ... ), keyphrase extraction can be improved. These new methods also focus on the semantics and context of a document, which is quite an improvement.",
|
|
|
3 |
"models": [
|
4 |
-
"
|
5 |
-
"
|
6 |
-
"
|
7 |
-
"
|
8 |
-
"
|
9 |
-
"
|
10 |
-
"
|
11 |
-
"
|
12 |
]
|
13 |
}
|
|
|
1 |
{
|
2 |
"example_text": "Keyphrase extraction is a technique in text analysis where you extract the important keyphrases from a text. Since this is a time-consuming process, Artificial Intelligence is used to automate it. Currently, classical machine learning methods, that use statistics and linguistics, are widely used for the extraction process. The fact that these methods have been widely used in the community has the advantage that there are many easy-to-use libraries. Now with the recent innovations in deep learning methods (such as recurrent neural networks and transformers, GANS, ... ), keyphrase extraction can be improved. These new methods also focus on the semantics and context of a document, which is quite an improvement.",
|
3 |
+
"model_author": "DeDeckerThomas",
|
4 |
"models": [
|
5 |
+
"keyphrase-extraction-kbir-inspec",
|
6 |
+
"keyphrase-extraction-distilbert-inspec",
|
7 |
+
"keyphrase-extraction-distilbert-openkp",
|
8 |
+
"keyphrase-extraction-distilbert-kptimes",
|
9 |
+
"keyphrase-extraction-kbir-kpcrowd",
|
10 |
+
"keyphrase-generation-keybart-inspec",
|
11 |
+
"keyphrase-generation-t5-small-inspec",
|
12 |
+
"keyphrase-generation-t5-small-openkp"
|
13 |
]
|
14 |
}
|
pipelines/__pycache__/keyphrase_extraction_pipeline.cpython-39.pyc
ADDED
Binary file (1.31 kB). View file
|
|
pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc
ADDED
Binary file (1.49 kB). View file
|
|
pipelines/keyphrase_extraction_pipeline.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import (
|
2 |
+
TokenClassificationPipeline,
|
3 |
+
AutoModelForTokenClassification,
|
4 |
+
AutoTokenizer,
|
5 |
+
)
|
6 |
+
from transformers.pipelines import AggregationStrategy
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
|
10 |
+
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
|
11 |
+
def __init__(self, model, *args, **kwargs):
|
12 |
+
super().__init__(
|
13 |
+
model=AutoModelForTokenClassification.from_pretrained(model),
|
14 |
+
tokenizer=AutoTokenizer.from_pretrained(model),
|
15 |
+
*args,
|
16 |
+
**kwargs
|
17 |
+
)
|
18 |
+
|
19 |
+
def postprocess(self, model_outputs):
|
20 |
+
results = super().postprocess(
|
21 |
+
model_outputs=model_outputs,
|
22 |
+
aggregation_strategy=AggregationStrategy.SIMPLE,
|
23 |
+
)
|
24 |
+
return np.unique([result.get("word").strip() for result in results])
|
pipelines/keyphrase_generation_pipeline.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import (
|
2 |
+
Text2TextGenerationPipeline,
|
3 |
+
AutoModelForSeq2SeqLM,
|
4 |
+
AutoTokenizer,
|
5 |
+
)
|
6 |
+
|
7 |
+
|
8 |
+
class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
|
9 |
+
def __init__(self, model, keyphrase_sep_token=";", *args, **kwargs):
|
10 |
+
super().__init__(
|
11 |
+
model=AutoModelForSeq2SeqLM.from_pretrained(model),
|
12 |
+
tokenizer=AutoTokenizer.from_pretrained(model),
|
13 |
+
*args,
|
14 |
+
**kwargs
|
15 |
+
)
|
16 |
+
self.keyphrase_sep_token = keyphrase_sep_token
|
17 |
+
|
18 |
+
def postprocess(self, model_outputs):
|
19 |
+
results = super().postprocess(model_outputs=model_outputs)
|
20 |
+
print(results)
|
21 |
+
return [
|
22 |
+
[
|
23 |
+
keyphrase.strip()
|
24 |
+
for keyphrase in result.get("generated_text").split(
|
25 |
+
self.keyphrase_sep_token
|
26 |
+
)
|
27 |
+
]
|
28 |
+
for result in results
|
29 |
+
][0]
|
test.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Scratch script: loads a multi-label emotion classifier, builds a
# transformers-interpret explainer around it, and prints per-word
# attributions for one sample sentence.
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers_interpret import MultiLabelClassificationExplainer

# Emotion classifier checkpoint (DistilRoBERTa fine-tuned for English emotions).
model_name = "j-hartmann/emotion-english-distilroberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


cls_explainer = MultiLabelClassificationExplainer(model, tokenizer)


# Attribution scores for every token, per label.
word_attributions = cls_explainer("There were many aspects of the film I liked, but it was frightening and gross in parts. My parents hated it.")
print(word_attributions)
|