nazneen committed on
Commit
ad6c924
1 Parent(s): 57fbd89

dataset explr app

Browse files
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## LIBRARIES ###
2
+ ## Data
3
+ import numpy as np
4
+ import pandas as pd
5
+ import torch
6
+ import json
7
+ from tqdm import tqdm
8
+ from math import floor
9
+ from datasets import load_dataset
10
+ from collections import defaultdict
11
+ from transformers import AutoTokenizer
12
+ pd.options.display.float_format = '${:,.2f}'.format
13
+
14
+ # Analysis
15
+
16
+ # App & Visualization
17
+ import streamlit as st
18
+ from bokeh.models import CustomJS, ColumnDataSource, HoverTool, BoxSelectTool, Callback, Select, TextInput, DataTable, TableColumn
19
+ from bokeh.events import SelectionGeometry
20
+ from bokeh.plotting import figure, output_file, show
21
+ from bokeh.transform import factor_cmap
22
+ from bokeh.palettes import Category20c_20
23
+ from bokeh.layouts import column, row
24
+
25
+ # utils
26
+ from random import sample
27
+
28
def datasets_explorer_viz(df):
    """Build and display the interactive "Dataset explorer" Bokeh layout.

    One scatter point is drawn per row of ``df`` using its ``x`` and ``y``
    columns, coloured by ``task``; hovering shows ``dataset_id`` and ``task``.
    A search box sits above the plot and a table beside it lists the
    ``task`` / ``dataset_id`` of whichever points are currently selected.

    The finished layout is passed to Bokeh's ``show``; nothing is returned.
    """
    # JavaScript run in the browser whenever the search box value changes.
    # It only logs the value; no filtering is wired up.
    search_log_js = """
        console.log('text_input: value=' + this.value, this.toString())
    """

    # JavaScript run in the browser whenever the scatter selection changes:
    # rebuild the table columns from the selected row indices.
    selection_to_table_js = """
        const inds = cb_obj.indices;
        const tableData = table_source.data;
        const umapData = umap_source.data;

        //tableData['x'] = []
        //tableData['y'] = []
        tableData['task'] = []
        tableData['dataset_id'] = []

        for (let i = 0; i < inds.length; i++) {
        // tableData['x'].push(umapData['x'][inds[i]])
        // tableData['y'].push(umapData['y'][inds[i]])
        tableData['task'].push(umapData['task'][inds[i]])
        tableData['dataset_id'].push(umapData['dataset_id'][inds[i]])
        }
        table_source.data = tableData;
        table_source.change.emit();
        """

    points_source = ColumnDataSource(df)

    # Search box.
    search_box = TextInput(value="", title="Search")
    search_box.js_on_change("value", CustomJS(code=search_log_js))

    # Scatter plot, one colour per distinct task.
    hover_fields = [("dataset_id", "@dataset_id"), ("task", "@task")]
    task_colors = factor_cmap(
        'task',
        palette=Category20c_20,
        factors=df['task'].unique(),
    )
    plot = figure(
        plot_width=1000,
        plot_height=1000,
        tools="hover,wheel_zoom,pan,box_select",
        title="Dataset explorer",
        tooltips=hover_fields,
        toolbar_location="above",
    )
    plot.scatter(
        'x',
        'y',
        size=3,
        source=points_source,
        alpha=0.8,
        marker='circle',
        fill_color=task_colors,
        line_color=task_colors,
        legend_field='task',
    )
    plot.legend.location = "bottom_right"
    plot.legend.label_text_font_size = "8pt"

    # Table of the selected points; starts empty and is filled by the
    # selection callback. The x / y columns are currently not shown.
    table_source = ColumnDataSource(data={})
    table_columns = [
        TableColumn(field="task", title="Task"),
        TableColumn(field="dataset_id", title="Dataset ID"),
    ]
    selection_table = DataTable(source=table_source, columns=table_columns, width=300)

    selection_callback = CustomJS(
        args={"umap_source": points_source, "table_source": table_source},
        code=selection_to_table_js,
    )
    points_source.selected.js_on_change('indices', selection_callback)

    # NOTE(review): `show` is Bokeh's own display call; confirm that its output
    # actually appears inside the Streamlit page when this runs as an app.
    layout = row(column(search_box, plot), selection_table)
    show(layout)
71
+
72
+
73
if __name__ == "__main__":
    # Local import: only this entry point needs filesystem checks.
    from pathlib import Path

    ### STREAMLIT APP CONFIG ###
    st.set_page_config(layout="wide", page_title="Datasets Explorer")

    #lcol, rcol = st.columns([2, 2])

    ### LOAD DATA AND SESSION VARIABLES ###
    # The original location is tried first so existing layouts keep working.
    # The parquet file may instead live directly under ./data/, so fall back
    # to that. If neither exists, the original path is used so the usual
    # "file not found" error still points at it.
    candidate_paths = (
        './assets/data/datasets_df.parquet',
        './data/datasets_df.parquet',
    )
    parquet_path = next(
        (path for path in candidate_paths if Path(path).is_file()),
        candidate_paths[0],
    )
    datasets_df = pd.read_parquet(parquet_path)
    datasets_explorer_viz(datasets_df)
data/datasets_df.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01cb9a03e5bd4e29cecf390e5449a2cb413f9fc73daa0750b23e204397eb1ba6
3
+ size 15238
data/hfid_to_pwcinfo.json ADDED
The diff for this file is too large to render. See raw diff
 
data/paper_dataset_emb.json ADDED
The diff for this file is too large to render. See raw diff
 
data/paper_emb.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4==4.11.1
2
+ bokeh==2.4.2
3
+ datasets==2.0.0
4
+ numpy==1.21.5
5
+ pandas==1.4.2
6
+ requests==2.27.1
7
+ streamlit==1.9.0
8
+ torch==1.10.2
9
+ tqdm==4.64.0
10
+ transformers==4.18.0