File size: 3,156 Bytes
ad6c924
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
## LIBRARIES ###
## Data
import numpy as np
import pandas as pd
import torch
import json
from tqdm import tqdm
from math import floor
from datasets import load_dataset
from collections import defaultdict
from transformers import AutoTokenizer
pd.options.display.float_format = '${:,.2f}'.format

# Analysis

# App & Visualization
import streamlit as st
from bokeh.models import CustomJS, ColumnDataSource, HoverTool, BoxSelectTool, Callback, Select, TextInput, DataTable, TableColumn
from bokeh.events import  SelectionGeometry
from bokeh.plotting import figure, output_file, show
from bokeh.transform import factor_cmap
from bokeh.palettes import Category20c_20
from bokeh.layouts import column, row

# utils
from random import sample

def datasets_explorer_viz(df):
    """Show an interactive scatter plot of datasets with a selection table.

    Builds a Bokeh layout made of a "Search" text box stacked above a scatter
    plot, with a data table to the right, and hands it to Bokeh's ``show``.

    Args:
        df: Table used to build the plot's ``ColumnDataSource``. The columns
            ``x``, ``y``, ``task`` and ``dataset_id`` are read: ``x``/``y``
            position the points, ``task`` drives color and legend, and
            ``task``/``dataset_id`` feed the hover tooltip and the table.

    Returns:
        None.
    """
    # NOTE(review): show() is Bokeh's own display call, not a Streamlit one;
    # confirm this is the intended way to render inside the Streamlit app.
    scatter_source = ColumnDataSource(df)

    # The search box currently only logs its value to the browser console.
    search_log_js = """
        console.log('text_input: value=' + this.value, this.toString())
    """
    search_box = TextInput(value="", title="Search")
    search_box.js_on_change("value", CustomJS(code=search_log_js))

    hover_fields = [("dataset_id", "@dataset_id"), ("task", "@task")]
    # One palette entry per distinct task value found in the data.
    task_colors = factor_cmap(
        'task',
        palette=Category20c_20,
        factors=df['task'].unique(),
    )

    plot = figure(
        plot_width=1000,
        plot_height=1000,
        tools="hover,wheel_zoom,pan,box_select",
        title="Dataset explorer",
        tooltips=hover_fields,
        toolbar_location="above",
    )
    plot.scatter(
        'x',
        'y',
        size=3,
        source=scatter_source,
        alpha=0.8,
        marker='circle',
        fill_color=task_colors,
        line_color=task_colors,
        legend_field='task',
    )
    plot.legend.location = "bottom_right"
    plot.legend.label_text_font_size = "8pt"

    # The table starts empty; the selection callback below fills it.
    selection_source = ColumnDataSource(data={})
    table_columns = [
        TableColumn(field="task", title="Task"),
        TableColumn(field="dataset_id", title="Dataset ID"),
    ]
    selection_table = DataTable(
        source=selection_source,
        columns=table_columns,
        width=300,
    )

    # Browser-side callback: copy task/dataset_id of every selected point
    # from the scatter source into the table source.
    selection_js = """
            const inds = cb_obj.indices;
            const tableData = table_source.data;
            const umapData = umap_source.data;
            
            //tableData['x'] = []
            //tableData['y'] = []
            tableData['task'] = []
            tableData['dataset_id'] = []

            for (let i = 0; i < inds.length; i++) {
                // tableData['x'].push(umapData['x'][inds[i]])
                // tableData['y'].push(umapData['y'][inds[i]])
                tableData['task'].push(umapData['task'][inds[i]])
                tableData['dataset_id'].push(umapData['dataset_id'][inds[i]])
            }
            table_source.data = tableData;
            table_source.change.emit();
    """
    selection_callback = CustomJS(
        args={"umap_source": scatter_source, "table_source": selection_source},
        code=selection_js,
    )
    scatter_source.selected.js_on_change('indices', selection_callback)

    left_panel = column(search_box, plot)
    show(row(left_panel, selection_table))


if __name__ == "__main__":
    # Streamlit app config: wide layout and page title.
    st.set_page_config(page_title="Datasets Explorer", layout="wide")

    # Load the datasets table from the parquet file under ./assets/data and
    # pass it to the explorer visualization.
    datasets_df = pd.read_parquet('./assets/data/datasets_df.parquet')
    datasets_explorer_viz(datasets_df)