lambdaofgod committed on
Commit
3af9af7
·
1 Parent(s): f6555fb

feat: Add PapersWithCode tasks visualization

Browse files
Files changed (2) hide show
  1. app.py +9 -3
  2. task_visualizations.py +72 -0
app.py CHANGED
@@ -110,19 +110,25 @@ with gr.Blocks() as demo:
110
  with gr.Tab("Explore Repository Representations"):
111
  setup_repository_representations_tab(repos, representation_types)
112
  with gr.Tab("Explore PapersWithCode Tasks"):
113
- gr.Markdown("## PapersWithCode Tasks Visualization")
 
 
 
 
 
 
114
 
115
  with gr.Row():
116
  min_task_counts_slider = gr.Slider(
117
  minimum=10,
118
  maximum=1000,
119
- value=10,
120
  step=10,
121
  label="Minimum Task Count",
122
  )
123
  update_button = gr.Button("Update Plots")
124
 
125
- with gr.Column("Task Counts"):
126
  all_repos_tasks_plot = gr.Plot(label="All Repositories")
127
  selected_repos_tasks_plot = gr.Plot(label="Selected Repositories")
128
 
 
110
  with gr.Tab("Explore Repository Representations"):
111
  setup_repository_representations_tab(repos, representation_types)
112
  with gr.Tab("Explore PapersWithCode Tasks"):
113
+ task_counts_description = """
114
+ ## PapersWithCode Tasks Visualization
115
+
116
+ PapersWithCode tasks are grouped by area.
117
+ """.strip()
118
+
119
+ gr.Markdown(task_counts_description)
120
 
121
  with gr.Row():
122
  min_task_counts_slider = gr.Slider(
123
  minimum=10,
124
  maximum=1000,
125
+ value=100,
126
  step=10,
127
  label="Minimum Task Count",
128
  )
129
  update_button = gr.Button("Update Plots")
130
 
131
+ with gr.Row("Task Counts"):
132
  all_repos_tasks_plot = gr.Plot(label="All Repositories")
133
  selected_repos_tasks_plot = gr.Plot(label="Selected Repositories")
134
 
task_visualizations.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import ast
3
+ import json
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+
7
+
8
class TaskVisualizations:
    """Prepare PapersWithCode task counts, grouped by area, for sunburst plots.

    Two tables are loaded at construction time: one built from the counts in
    ``task_counts_path`` and one from ``selected_task_counts_path``. Both are
    inner-joined with the task/area table on the ``task`` column.
    """

    def __init__(
        self, task_counts_path, selected_task_counts_path, tasks_with_areas_path
    ):
        """Load both merged task tables.

        Args:
            task_counts_path: CSV with task counts for all repositories.
            selected_task_counts_path: CSV with task counts for the selected
                repositories.
            tasks_with_areas_path: CSV mapping tasks to areas.
        """
        self.tasks_with_areas_df = self.load_tasks_with_areas_df(
            task_counts_path, tasks_with_areas_path
        )
        self.selected_tasks_with_areas_df = self.load_tasks_with_areas_df(
            selected_task_counts_path, tasks_with_areas_path
        )

    @classmethod
    def load_tasks_with_areas_df(
        cls, task_counts_path, tasks_with_areas_path="data/paperswithcode_tasks.csv"
    ):
        """Read both CSV files and inner-join them on the ``task`` column.

        Args:
            task_counts_path: CSV containing at least a ``task`` column.
            tasks_with_areas_path: CSV containing at least a ``task`` column.

        Returns:
            The task/area table merged with the task counts.
        """
        task_counts_df = pd.read_csv(task_counts_path)
        raw_tasks_with_areas_df = pd.read_csv(tasks_with_areas_path)
        return raw_tasks_with_areas_df.merge(task_counts_df, on="task")

    @classmethod
    def get_topk_merge_others(cls, df, by_col, val_col, k=10, val_threshold=1000):
        """Keep the ``k`` largest labels and fold everything else into "other".

        A label survives only if it is among the ``k`` rows with the largest
        ``val_col`` AND its value is at least ``val_threshold``; every other
        row is relabelled ``"other"``. Values are then summed per label.

        Args:
            df: Frame holding at least ``by_col`` and ``val_col``.
            by_col: Name of the label column.
            val_col: Name of the numeric column to rank and sum.
            k: Maximum number of labels to keep.
            val_threshold: Minimum value a top-``k`` label needs to be kept.

        Returns:
            A frame indexed by ``by_col`` with the single column ``val_col``
            holding the per-label sums.
        """
        # sort_values already returns a new frame, so the input is not mutated.
        ranked = df.sort_values(val_col, ascending=False)
        head = ranked.iloc[:k]
        # For a label repeated inside the top-k, the last (smallest) value
        # wins, matching a plain dict built from the ranked rows.
        top_counts = dict(zip(head[by_col], head[val_col]))

        def relabel(label):
            if label in top_counts and top_counts[label] >= val_threshold:
                return label
            return "other"

        ranked[by_col] = ranked[by_col].map(relabel)
        # The string "sum" selects pandas' own reduction; passing the builtin
        # is deprecated in recent pandas releases.
        return ranked.groupby(by_col).agg({val_col: "sum"})

    @classmethod
    def get_displayed_tasks_with_areas_df(cls, tasks_with_areas_df, min_task_count):
        """Build the per-area table that is handed to the sunburst plot.

        Rows containing missing values are dropped. Within each area, tasks
        whose count is below ``min_task_count`` (or that are not among the
        area's largest tasks) are merged into a single ``"other"`` entry.
        Each task label gets its count appended, e.g. ``"qa 200"``.

        Args:
            tasks_with_areas_df: Frame with ``area``, ``task`` and ``count``
                columns.
            min_task_count: Minimum count for a task to be shown by name.

        Returns:
            A frame with the columns ``area``, ``task`` and ``count``.
        """
        displayed = tasks_with_areas_df.dropna().copy()
        if displayed.empty:
            # Nothing to group; return the expected schema explicitly.
            return pd.DataFrame(columns=["area", "task", "count"])

        displayed["task"] = displayed["task"].where(
            displayed["count"] >= min_task_count, "other"
        )
        # Forward min_task_count as the threshold: with the helper's default
        # of 1000 every task below 1000 would collapse into "other" and
        # min_task_count would have no visible effect.
        displayed = (
            displayed.groupby("area")[["task", "count"]]
            .apply(
                lambda group: cls.get_topk_merge_others(
                    group, "task", "count", val_threshold=min_task_count
                )
            )
            .reset_index()
        )
        displayed["task"] = displayed["task"] + " " + displayed["count"].apply(str)
        return displayed

    def get_tasks_sunburst(self, min_task_count, which_df="selected"):
        """Return a sunburst figure of task counts nested under their areas.

        Args:
            min_task_count: Minimum count for a task to be shown by name.
            which_df: ``"selected"`` plots the selected-repositories table;
                any other value plots the all-repositories table.

        Returns:
            The figure produced by ``px.sunburst`` with path ``area`` ->
            ``task`` and ``count`` as values.
        """
        if which_df == "selected":
            df = self.selected_tasks_with_areas_df
        else:
            df = self.tasks_with_areas_df

        displayed_tasks_with_areas_df = self.get_displayed_tasks_with_areas_df(
            df, min_task_count
        )

        return px.sunburst(
            displayed_tasks_with_areas_df, path=["area", "task"], values="count"
        )