Spaces:

lambdaofgod
/

github_search_visualizations

Sleeping

App Files Files Community

lambdaofgod committed on Oct 17, 2024

Commit

3af9af7

1 Parent(s): f6555fb

feat: Add PapersWithCode tasks visualization

Browse files

Files changed (2) hide show

app.py +9 -3
task_visualizations.py +72 -0

app.py CHANGED Viewed

@@ -110,19 +110,25 @@ with gr.Blocks() as demo:
     with gr.Tab("Explore Repository Representations"):
         setup_repository_representations_tab(repos, representation_types)
     with gr.Tab("Explore PapersWithCode Tasks"):
-        gr.Markdown("## PapersWithCode Tasks Visualization")
         with gr.Row():
             min_task_counts_slider = gr.Slider(
                 minimum=10,
                 maximum=1000,
-                value=10,
                 step=10,
                 label="Minimum Task Count",
             )
             update_button = gr.Button("Update Plots")
-        with gr.Column("Task Counts"):
             all_repos_tasks_plot = gr.Plot(label="All Repositories")
             selected_repos_tasks_plot = gr.Plot(label="Selected Repositories")

     with gr.Tab("Explore Repository Representations"):
         setup_repository_representations_tab(repos, representation_types)
     with gr.Tab("Explore PapersWithCode Tasks"):
+        task_counts_description = """
+        ## PapersWithCode Tasks Visualization
+        PapersWithCode tasks are grouped by area.
+        """.strip()
+        gr.Markdown(task_counts_description)
         with gr.Row():
             min_task_counts_slider = gr.Slider(
                 minimum=10,
                 maximum=1000,
+                value=100,
                 step=10,
                 label="Minimum Task Count",
             )
             update_button = gr.Button("Update Plots")
+        with gr.Row("Task Counts"):
             all_repos_tasks_plot = gr.Plot(label="All Repositories")
             selected_repos_tasks_plot = gr.Plot(label="Selected Repositories")

task_visualizations.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import pandas as pd
+import ast
+import json
+import plotly.express as px
+import plotly.graph_objects as go
class TaskVisualizations:
    """Prepare PapersWithCode task counts and render them as sunburst plots.

    Two merged tables are kept: one built from ``task_counts_path`` ("all")
    and one built from ``selected_task_counts_path`` ("selected"). Both are
    joined with the task/area table on the ``task`` column. Later processing
    reads the ``area``, ``task`` and ``count`` columns of the merged tables.
    """

    # Label under which low-count tasks are merged.
    OTHER_LABEL = "other"

    def __init__(
        self, task_counts_path, selected_task_counts_path, tasks_with_areas_path
    ):
        """Load both merged task tables from CSV files.

        Args:
            task_counts_path: CSV with task counts for all repositories.
            selected_task_counts_path: CSV with task counts for the selected
                repositories.
            tasks_with_areas_path: CSV joined to each counts table on ``task``.
        """
        self.tasks_with_areas_df = self.load_tasks_with_areas_df(
            task_counts_path, tasks_with_areas_path
        )
        self.selected_tasks_with_areas_df = self.load_tasks_with_areas_df(
            selected_task_counts_path, tasks_with_areas_path
        )

    @classmethod
    def load_tasks_with_areas_df(
        cls, task_counts_path, tasks_with_areas_path="data/paperswithcode_tasks.csv"
    ) -> pd.DataFrame:
        """Read both CSV files and inner-join them on the ``task`` column.

        Args:
            task_counts_path: CSV file containing a ``task`` column.
            tasks_with_areas_path: CSV file containing a ``task`` column.

        Returns:
            The tasks/areas table merged with the counts table; only tasks
            present in both files are kept (default inner merge).
        """
        task_counts_df = pd.read_csv(task_counts_path)
        raw_tasks_with_areas_df = pd.read_csv(tasks_with_areas_path)
        return raw_tasks_with_areas_df.merge(task_counts_df, on="task")

    @classmethod
    def get_topk_merge_others(
        cls, df, by_col, val_col, k=10, val_threshold=1000
    ) -> pd.DataFrame:
        """Keep the top-``k`` labels by value and merge the rest into "other".

        A label is kept only if it appears among the ``k`` rows with the
        largest ``val_col`` AND its value there is at least ``val_threshold``.
        Every other row is relabelled "other"; values are then summed per
        label.

        Args:
            df: Input frame; it is not modified.
            by_col: Name of the label column.
            val_col: Name of the numeric value column.
            k: Number of highest-valued rows eligible to keep their label.
            val_threshold: Minimum value a top-``k`` row needs to keep its
                label.

        Returns:
            A frame indexed by ``by_col`` with a single ``val_col`` column
            holding the per-label sums.
        """
        # sort_values returns a new frame, so the caller's data is untouched.
        sorted_df = df.sort_values(val_col, ascending=False)
        top_rows = sorted_df.head(k)
        # If a label occurs several times in the top rows the last (smallest)
        # value wins, matching plain dict construction.
        topk_values = dict(zip(top_rows[by_col], top_rows[val_col]))
        kept_labels = [
            label for label, value in topk_values.items() if value >= val_threshold
        ]
        labels = sorted_df[by_col]
        sorted_df[by_col] = labels.where(labels.isin(kept_labels), cls.OTHER_LABEL)
        # The string "sum" avoids the FutureWarning pandas emits when the
        # builtin ``sum`` callable is passed to ``agg``.
        return sorted_df.groupby(by_col).agg({val_col: "sum"})

    @classmethod
    def get_displayed_tasks_with_areas_df(
        cls, tasks_with_areas_df, min_task_count
    ) -> pd.DataFrame:
        """Build the per-area table of displayed tasks.

        Rows containing missing values are dropped. Within every area, tasks
        whose count is below ``min_task_count`` (or that fall outside the
        area's top rows, see ``get_topk_merge_others``) are merged into
        "other". The task label is finally suffixed with its summed count,
        e.g. ``"some task 120"``.

        Args:
            tasks_with_areas_df: Frame with ``area``, ``task`` and ``count``
                columns; it is not modified.
            min_task_count: Minimum count a task needs to be shown by name.

        Returns:
            A frame with ``area``, ``task`` and ``count`` columns.
        """
        displayed_df = tasks_with_areas_df.dropna().copy()
        if displayed_df.empty:
            # Nothing to group; return the output schema explicitly.
            return pd.DataFrame({"area": [], "task": [], "count": []})

        displayed_df["task"] = displayed_df["task"].where(
            displayed_df["count"] >= min_task_count, cls.OTHER_LABEL
        )
        # Forward min_task_count as the threshold: with the helper's default
        # of 1000 every task below 1000 would be merged regardless of the
        # requested minimum. Selecting the two columns keeps the grouping
        # column out of the applied function.
        displayed_df = (
            displayed_df.groupby("area")[["task", "count"]]
            .apply(
                lambda area_df: cls.get_topk_merge_others(
                    area_df, "task", "count", val_threshold=min_task_count
                )
            )
            .reset_index()
        )
        displayed_df["task"] = (
            displayed_df["task"] + " " + displayed_df["count"].astype(str)
        )
        return displayed_df

    def get_tasks_sunburst(self, min_task_count, which_df="selected"):
        """Return a sunburst figure of task counts nested under areas.

        Args:
            min_task_count: Minimum count a task needs to be shown by name.
            which_df: ``"selected"`` uses the selected-repositories table;
                any other value uses the all-repositories table.

        Returns:
            The figure produced by ``plotly.express.sunburst``.
        """
        if which_df == "selected":
            df = self.selected_tasks_with_areas_df
        else:
            df = self.tasks_with_areas_df
        displayed_tasks_with_areas_df = self.get_displayed_tasks_with_areas_df(
            df, min_task_count
        )
        return px.sunburst(
            displayed_tasks_with_areas_df, path=["area", "task"], values="count"
        )