from ast import literal_eval

import pandas as pd
import streamlit as st
from datasets import load_dataset


nlp_tasks = [
    "text-classification", "text-generation", "text2text-generation",
    "token-classification", "fill-mask", "question-answering",
    "translation", "conversational", "sentence-similarity", "summarization",
    "multiple-choice", "zero-shot-classification", "table-question-answering",
]
audio_tasks = [
    "automatic-speech-recognition", "audio-classification", "text-to-speech",
    "audio-to-audio", "voice-activity-detection",
]
cv_tasks = [
    "image-classification", "image-segmentation",
    "zero-shot-image-classification", "image-to-image",
    "unconditional-image-generation", "object-detection",
]
multimodal = [
    "feature-extraction", "text-to-image", "visual-question-answering",
    "image-to-text", "document-question-answering",
]
tabular = ["tabular-classification", "tabular-regression"]

modalities = {
    "nlp": nlp_tasks,
    "audio": audio_tasks,
    "cv": cv_tasks,
    "multimodal": multimodal,
    "tabular": tabular,
    "rl": ["reinforcement-learning"]
}

def modality(row):
    # Map a repo's pipeline tag to its broad modality.
    pipeline = row["pipeline"]
    for modality_name, tasks in modalities.items():
        if pipeline in tasks:
            return modality_name
    # A pipeline string we don't recognize gets its own bucket; missing
    # values (e.g. NaN) fall through to None.
    if isinstance(pipeline, str):
        return "unk_modality"
    return None
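
# A minimal sketch of how `modality` behaves on a row-like mapping; the
# example values below are illustrative, not taken from the dataset:
#   modality({"pipeline": "fill-mask"})     -> "nlp"
#   modality({"pipeline": "made-up-task"})  -> "unk_modality"
#   modality({"pipeline": None})            -> None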

@st.cache(allow_output_mutation=True)
def process_dataset(version):
    # Load dataset at specified revision
    dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)

    # Convert to pandas dataframe
    data = dataset["train"].to_pandas()

    # Add modality column
    data["modality"] = data.apply(modality, axis=1)

    # Bin the model card length into coarse buckets for plotting
    data["length_bins"] = pd.cut(
        data["text_length"],
        [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000],
    )

    return data
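
# Illustrative call; this downloads the dataset from the Hub, so the revision
# name "main" here is only an assumption about what a caller would pass:
#   data = process_dataset("main")
#   data[["pipeline", "modality", "length_bins"]].head()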

def eval_tags(row):
    tags = row["tags"]
    # Empty or placeholder values mean no tags.
    if tags == "none" or tags == [] or tags == "{}":
        return []
    # A bare string like "pytorch" is wrapped so it parses as a one-item list.
    if tags[0] != "[":
        tags = str([tags])
    val = literal_eval(tags)
    # A dict here means malformed metadata; treat it as no tags.
    if isinstance(val, dict):
        return []
    return val
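
# Sketch of the normalization `eval_tags` performs (hypothetical inputs):
#   eval_tags({"tags": "none"})         -> []
#   eval_tags({"tags": "['pytorch']"})  -> ["pytorch"]
#   eval_tags({"tags": "pytorch"})      -> ["pytorch"]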

def change_pct(old, new):
    # Percentage change, computed relative to the *new* value; the large
    # negative sentinel flags a zero denominator so callers can spot it.
    if new == 0:
        return -10000000
    return round(100 * (new - old) / new, 3)

def change_and_delta(old_old, old, new):
    # The current period's change and how it moved versus the previous one.
    curr_change = change_pct(old, new)
    prev_change = change_pct(old_old, old)
    delta = round(curr_change - prev_change, 3)
    if delta > 0:
        # Positive deltas get an explicit "+...%"; negative deltas are
        # returned as bare floats.
        delta = f"+{delta}%"
    curr_change = f"{curr_change}%"
    return curr_change, delta
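

if __name__ == "__main__":
    # Quick sanity check with made-up numbers (not part of the Streamlit app):
    # change_pct(80, 100) is 100 * (100 - 80) / 100 = 20.0
    print(change_pct(80, 100))            # 20.0
    # the previous change is 100 * (80 - 60) / 80 = 25.0, so the delta is -5.0
    print(change_and_delta(60, 80, 100))  # ('20.0%', -5.0)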