osanseviero committed
Commit 8be36e0
1 Parent(s): 613ab12

Allow comparing versions

Files changed (2):
  1. changelog.md +7 -0
  2. models.py +176 -52
changelog.md ADDED
@@ -0,0 +1,7 @@
+ Changelog
+
+ v0.1
+ - Allow picking a comparison version
+ - Show deltas in all metrics
+ - Allow picking "not transformers" in the Pipeline and Library pages
+ - Show old and new metrics in the license, language, and library raw tables, with delta columns
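The first two changelog entries describe one mechanism: the app loads two dated snapshots of the models dataset and renders every headline number as a value plus a delta against the older snapshot. A minimal sketch of that pattern, assuming a `process_dataset(version)` loader like the one in models.py below (the CSV file layout here is hypothetical):

```python
import pandas as pd
import streamlit as st

# Stand-in for the app's loader: reads one dated snapshot of the dataset.
# The file naming is an assumption for this sketch.
def process_dataset(version: str) -> pd.DataFrame:
    return pd.read_csv(f"models_{version}.csv")

supported_revisions = ["03_10_22", "27_09_22"]  # newest first, as in models.py

col1, col2 = st.columns(2)
with col1:
    # index=1 defaults the base to the older snapshot
    base = st.selectbox("Old revision", supported_revisions, index=1)
with col2:
    # index=0 defaults the comparison to the newest snapshot
    new = st.selectbox("New revision", supported_revisions, index=0)

old_data = process_dataset(base)
data = process_dataset(new)

# Each headline number becomes value plus delta; st.metric renders the
# delta as a small green/red badge next to the value.
total, total_old = data.shape[0], old_data.shape[0]
st.metric(label="Total models", value=total, delta=int(total - total_old))
```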
models.py CHANGED
@@ -34,7 +34,7 @@ def main():
             return "unk_modality"
         return None

-    supported_revisions = ["27_09_22"]
+    supported_revisions = ["03_10_22", "27_09_22"]

    st.cache(allow_output_mutation=True)
    def process_dataset(version):
@@ -52,10 +52,20 @@ def main():

        return data

-    base = st.selectbox(
-        'What revision do you want to use',
-        supported_revisions)
-    data = process_dataset(base)
+    col1, col2 = st.columns(2)
+    with col1:
+        base = st.selectbox(
+            'Old revision',
+            supported_revisions,
+            index=1)
+    with col2:
+        new = st.selectbox(
+            'Last revision',
+            supported_revisions,
+            index=0)
+
+    old_data = process_dataset(base)
+    data = process_dataset(new)

    def eval_tags(row):
        tags = row["tags"]
@@ -68,10 +78,12 @@ def main():
            return []
        return val

+    old_data["tags"] = old_data.apply(eval_tags, axis=1)
    data["tags"] = data.apply(eval_tags, axis=1)

+    total_samples_old = old_data.shape[0]
    total_samples = data.shape[0]
-    st.metric(label="Total models", value=total_samples)
+    st.metric(label="Total models", value=total_samples, delta=total_samples-total_samples_old)

    # Tabs don't work in Spaces st version
    #tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
@@ -86,9 +98,13 @@ def main():

        data.loc[data.languages == "False", 'languages'] = None
        data.loc[data.languages == {}, 'languages'] = None
+        old_data.loc[old_data.languages == "False", 'languages'] = None
+        old_data.loc[old_data.languages == {}, 'languages'] = None

        no_lang_count = data["languages"].isna().sum()
+        no_lang_count_old = old_data["languages"].isna().sum()
        data["languages"] = data["languages"].fillna('none')
+        old_data["languages"] = old_data["languages"].fillna('none')

        def make_list(row):
            languages = row["languages"]
@@ -103,19 +119,28 @@ def main():

        data["languages"] = data.apply(make_list, axis=1)
        data["language_count"] = data.apply(language_count, axis=1)
+        old_data["languages"] = old_data.apply(make_list, axis=1)
+        old_data["language_count"] = old_data.apply(language_count, axis=1)

        models_with_langs = data[data["language_count"] > 0]
        langs = models_with_langs["languages"].explode()
        langs = langs[langs != {}]
        total_langs = len(langs.unique())

+        models_with_langs_old = old_data[old_data["language_count"] > 0]
+        langs_old = models_with_langs_old["languages"].explode()
+        langs_old = langs_old[langs_old != {}]
+        total_langs_old = len(langs_old.unique())
+
        col1, col2, col3 = st.columns(3)
        with col1:
-            st.metric(label="Language Specified", value=total_samples-no_lang_count)
+            v = total_samples-no_lang_count
+            v_old = total_samples_old-no_lang_count_old
+            st.metric(label="Language Specified", value=v, delta=int(v-v_old))
        with col2:
-            st.metric(label="No Language Specified", value=no_lang_count)
+            st.metric(label="No Language Specified", value=no_lang_count, delta=int(no_lang_count-no_lang_count_old))
        with col3:
-            st.metric(label="Total Unique Languages", value=total_langs)
+            st.metric(label="Total Unique Languages", value=total_langs, delta=int(total_langs-total_langs_old))

        st.subheader("Count of languages per model repo")
        st.text("Some repos are for multiple languages, so the count is greater than 1")
@@ -132,6 +157,8 @@ def main():

        models_with_langs = data[data["language_count"] > filter]
        df1 = models_with_langs['language_count'].value_counts()
+        models_with_langs_old = old_data[old_data["language_count"] > filter]
+        df1_old = models_with_langs_old['language_count'].value_counts()
        st.bar_chart(df1)

        st.subheader("Most frequent languages")
@@ -150,9 +177,14 @@ def main():
        models_with_langs = data[data["language_count"] > 0]
        langs = models_with_langs["languages"].explode()
        langs = langs[langs != {}]
-
        orig_d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
        d = orig_d
+
+        models_with_langs_old = old_data[old_data["language_count"] > 0]
+        langs = models_with_langs_old["languages"].explode()
+        langs = langs[langs != {}]
+        orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
+
        if filter == 1:
            d = orig_d.iloc[1:]
        elif filter == 2:
@@ -167,10 +199,22 @@ def main():
        ))

        st.subheader("Raw Data")
-        l = df1.rename_axis("lang_count").reset_index().rename(columns={"language_count": "repos_count"})
-        st.dataframe(l)
+        l = df1.rename_axis("lang_count").reset_index().rename(columns={"language_count": "r_c"})
+        l_old = df1_old.rename_axis("lang_count").reset_index().rename(columns={"language_count": "old_r_c"})
+        final_data = pd.merge(
+            l, l_old, how="outer", on="lang_count"
+        )
+        final_data["diff"] = final_data["r_c"] - final_data["old_r_c"]
+        st.dataframe(final_data)
+
        d = orig_d.astype(str)
-        st.dataframe(d)
+        orig_d_old = orig_d_old.astype(str).rename(columns={"counts": "old_c"})
+        final_data = pd.merge(
+            d, orig_d_old, how="outer", on="language"
+        )
+        final_data["diff"] = final_data["counts"].astype(int) - final_data["old_c"].astype(int)
+
+        st.dataframe(final_data)



@@ -179,13 +223,18 @@ def main():
        st.header("License info")

        no_license_count = data["license"].isna().sum()
+        no_license_count_old = old_data["license"].isna().sum()
        col1, col2, col3 = st.columns(3)
        with col1:
-            st.metric(label="License Specified", value=total_samples-no_license_count)
+            v = total_samples-no_license_count
+            v_old = total_samples_old-no_license_count_old
+            st.metric(label="License Specified", value=v, delta=int(v-v_old))
        with col2:
-            st.metric(label="No license Specified", value=no_license_count)
+            st.metric(label="No license Specified", value=no_license_count, delta=int(no_license_count-no_license_count_old))
        with col3:
-            st.metric(label="Total Unique Licenses", value=len(data["license"].unique()))
+            unique_licenses = len(data["license"].unique())
+            unique_licenses_old = len(old_data["license"].unique())
+            st.metric(label="Total Unique Licenses", value=unique_licenses, delta=int(unique_licenses-unique_licenses_old))

        st.subheader("Distribution of licenses per model repo")
        license_filter = st.selectbox(
@@ -217,7 +266,12 @@ def main():

        st.subheader("Raw Data")
        d = data["license"].value_counts().rename_axis("license").to_frame('counts').reset_index()
-        st.dataframe(d)
+        d_old = old_data["license"].value_counts().rename_axis("license").to_frame('counts').reset_index().rename(columns={"counts": "old_c"})
+        final_data = pd.merge(
+            d, d_old, how="outer", on="license"
+        )
+        final_data["diff"] = final_data["counts"] - final_data["old_c"]
+        st.dataframe(final_data)

    #with tab3:
    if tab == "Pipeline":
@@ -229,14 +283,24 @@ def main():
        s = s[s.apply(type) == str]
        unique_tags = len(s.unique())

+        tags_old = old_data["tags"].explode()
+        tags_old = tags_old[tags_old.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
+        s = tags_old["tag"]
+        s = s[s.apply(type) == str]
+        unique_tags_old = len(s.unique())
+
        no_pipeline_count = data["pipeline"].isna().sum()
+        no_pipeline_count_old = old_data["pipeline"].isna().sum()
+
        col1, col2, col3 = st.columns(3)
        with col1:
-            st.metric(label="# models that have any pipeline", value=total_samples-no_pipeline_count)
+            v = total_samples-no_pipeline_count
+            v_old = total_samples_old-no_pipeline_count_old
+            st.metric(label="# models that have any pipeline", value=v, delta=int(v-v_old))
        with col2:
-            st.metric(label="No pipeline Specified", value=no_pipeline_count)
+            st.metric(label="No pipeline Specified", value=no_pipeline_count, delta=int(no_pipeline_count-no_pipeline_count_old))
        with col3:
-            st.metric(label="Total Unique Pipelines", value=len(data["pipeline"].unique()))
+            st.metric(label="Total Unique Tags", value=unique_tags, delta=int(unique_tags-unique_tags_old))

        pipeline_filter = st.selectbox(
            'Modalities',
@@ -260,19 +324,26 @@ def main():

        st.subheader("High-level metrics")
        filtered_data = data[data['pipeline'].notna()]
+        filtered_data_old = old_data[old_data['pipeline'].notna()]

        if filter == 1:
            filtered_data = data[data["modality"] == "nlp"]
+            filtered_data_old = old_data[old_data["modality"] == "nlp"]
        elif filter == 2:
            filtered_data = data[data["modality"] == "cv"]
+            filtered_data_old = old_data[old_data["modality"] == "cv"]
        elif filter == 3:
            filtered_data = data[data["modality"] == "audio"]
+            filtered_data_old = old_data[old_data["modality"] == "audio"]
        elif filter == 4:
            filtered_data = data[data["modality"] == "rl"]
+            filtered_data_old = old_data[old_data["modality"] == "rl"]
        elif filter == 5:
            filtered_data = data[data["modality"] == "multimodal"]
+            filtered_data_old = old_data[old_data["modality"] == "multimodal"]
        elif filter == 6:
            filtered_data = data[data["modality"] == "tabular"]
+            filtered_data_old = old_data[old_data["modality"] == "tabular"]

        col1, col2, col3 = st.columns(3)
        with col1:
@@ -283,7 +354,7 @@ def main():
        with col2:
            l = st.selectbox(
                'What library do you want to see?',
-                ["all", *filtered_data["library"].unique()]
+                ["all", "not transformers", *filtered_data["library"].unique()]
            )
        with col3:
            f = st.selectbox(
@@ -323,17 +394,26 @@ def main():

        if p != "all":
            filtered_data = filtered_data[filtered_data["pipeline"] == p]
+            filtered_data_old = filtered_data_old[filtered_data_old["pipeline"] == p]
-        if l != "all":
+        if l != "all" and l != "not transformers":
            filtered_data = filtered_data[filtered_data["library"] == l]
+            filtered_data_old = filtered_data_old[filtered_data_old["library"] == l]
+        if l == "not transformers":
+            filtered_data = filtered_data[filtered_data["library"] != "transformers"]
+            filtered_data_old = filtered_data_old[filtered_data_old["library"] != "transformers"]
        if f != "all":
            if f == "py":
                filtered_data = filtered_data[filtered_data["pytorch"] == 1]
+                filtered_data_old = filtered_data_old[filtered_data_old["pytorch"] == 1]
            elif f == "tf":
                filtered_data = filtered_data[filtered_data["tensorflow"] == 1]
+                filtered_data_old = filtered_data_old[filtered_data_old["tensorflow"] == 1]
            elif f == "jax":
                filtered_data = filtered_data[filtered_data["jax"] == 1]
+                filtered_data_old = filtered_data_old[filtered_data_old["jax"] == 1]
        if filt != []:
            filtered_data = filtered_data[filtered_data.apply(filter_fn, axis=1)]
+            filtered_data_old = filtered_data_old[filtered_data_old.apply(filter_fn, axis=1)]


        d = filtered_data["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
@@ -344,23 +424,31 @@ def main():
        )
        sums = grouped_data.sum()

+        d_old = filtered_data_old["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
+        grouped_data_old = filtered_data_old.groupby("pipeline").sum()[columns_of_interest]
+        final_data_old = pd.merge(
+            d_old, grouped_data_old, how="outer", on="pipeline"
+        )
+        sums = grouped_data.sum()
+        sums_old = grouped_data_old.sum()
+
        col1, col2, col3 = st.columns(3)
        with col1:
-            st.metric(label="Total models", value=filtered_data.shape[0])
+            st.metric(label="Total models", value=filtered_data.shape[0], delta=int(filtered_data.shape[0] - filtered_data_old.shape[0]))
        with col2:
-            st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"])
+            st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"], delta=int(sums["downloads_30d"] - sums_old["downloads_30d"]))
        with col3:
-            st.metric(label="Cumulative likes", value=sums["likes"])
+            st.metric(label="Cumulative likes", value=sums["likes"], delta=int(sums["likes"] - sums_old["likes"]))

        col1, col2, col3 = st.columns(3)
        with col1:
-            st.metric(label="Total in PT", value=sums["pytorch"])
+            st.metric(label="Total in PT", value=sums["pytorch"], delta=int(sums["pytorch"] - sums_old["pytorch"]))
        with col2:
-            st.metric(label="Total in TF", value=sums["tensorflow"])
+            st.metric(label="Total in TF", value=sums["tensorflow"], delta=int(sums["tensorflow"] - sums_old["tensorflow"]))
        with col3:
-            st.metric(label="Total in JAX", value=sums["jax"])
+            st.metric(label="Total in JAX", value=sums["jax"], delta=int(sums["jax"] - sums_old["jax"]))

-        st.metric(label="Unique Tags", value=unique_tags)
+        st.metric(label="Unique Tags", value=unique_tags, delta=int(unique_tags - unique_tags_old))



@@ -414,24 +502,25 @@ def main():

        columns_of_interest = ["prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]
        sums = data[columns_of_interest].sum()
+        sums_old = old_data[columns_of_interest].sum()

        col1, col2, col3, col4 = st.columns(4)
        with col1:
-            st.metric(label="Total PRs", value=sums["prs_count"])
+            st.metric(label="Total PRs", value=sums["prs_count"],delta=int(sums["prs_count"] - sums_old["prs_count"]))
        with col2:
-            st.metric(label="PRs opened", value=sums["prs_open"])
+            st.metric(label="PRs opened", value=sums["prs_open"], delta=int(sums["prs_open"] - sums_old["prs_open"]))
        with col3:
-            st.metric(label="PRs merged", value=sums["prs_merged"])
+            st.metric(label="PRs merged", value=sums["prs_merged"], delta=int(sums["prs_merged"] - sums_old["prs_merged"]))
        with col4:
-            st.metric(label="PRs closed", value=sums["prs_closed"])
+            st.metric(label="PRs closed", value=sums["prs_closed"], delta=int(sums["prs_closed"] - sums_old["prs_closed"]))

        col1, col2, col3 = st.columns(3)
        with col1:
-            st.metric(label="Total discussions", value=sums["discussions_count"])
+            st.metric(label="Total discussions", value=sums["discussions_count"], delta=int(sums["discussions_count"] - sums_old["discussions_count"]))
        with col2:
-            st.metric(label="Discussions open", value=sums["discussions_open"])
+            st.metric(label="Discussions open", value=sums["discussions_open"], delta=int(sums["discussions_open"] - sums_old["discussions_open"]))
        with col3:
-            st.metric(label="Discussions closed", value=sums["discussions_closed"])
+            st.metric(label="Discussions closed", value=sums["discussions_closed"], delta=int(sums["discussions_closed"] - sums_old["discussions_closed"]))

        filtered_data = data[["repo_id", "prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]].sort_values("prs_count", ascending=False).reset_index(drop=True)
        st.dataframe(filtered_data)
@@ -441,23 +530,29 @@ def main():
        st.header("Library info")

        no_library_count = data["library"].isna().sum()
+        no_library_count_old = old_data["library"].isna().sum()
        col1, col2, col3 = st.columns(3)
        with col1:
-            st.metric(label="# models that have any library", value=total_samples-no_library_count)
+            v = total_samples-no_library_count
+            v_old = total_samples_old-no_library_count_old
+            st.metric(label="# models that have any library", value=v, delta=int(v-v_old))
        with col2:
-            st.metric(label="No library Specified", value=no_library_count)
+            st.metric(label="No library Specified", value=no_library_count, delta=int(no_library_count-no_library_count_old))
        with col3:
-            st.metric(label="Total Unique library", value=len(data["library"].unique()))
+            v = len(data["library"].unique())
+            v_old = len(old_data["library"].unique())
+            st.metric(label="Total Unique library", value=v, delta=int(v-v_old))


        st.subheader("High-level metrics")
        filtered_data = data[data['library'].notna()]
+        filtered_data_old = old_data[old_data['library'].notna()]

        col1, col2 = st.columns(2)
        with col1:
            lib = st.selectbox(
                'What library do you want to see? ',
-                ["all", *filtered_data["library"].unique()]
+                ["all", "not transformers", *filtered_data["library"].unique()]
            )
        with col2:
            pip = st.selectbox(
@@ -465,11 +560,15 @@ def main():
                ["all", *filtered_data["pipeline"].unique()]
            )

-        if pip != "all":
+        if pip != "all" :
            filtered_data = filtered_data[filtered_data["pipeline"] == pip]
+            filtered_data_old = filtered_data_old[filtered_data_old["pipeline"] == pip]
-        if lib != "all":
+        if lib != "all" and lib != "not transformers":
            filtered_data = filtered_data[filtered_data["library"] == lib]
+            filtered_data_old = filtered_data_old[filtered_data_old["library"] == lib]
+        if lib == "not transformers":
+            filtered_data = filtered_data[filtered_data["library"] != "transformers"]
+            filtered_data_old = filtered_data_old[filtered_data_old["library"] != "transformers"]
-

        d = filtered_data["library"].value_counts().rename_axis("library").to_frame('counts').reset_index()
        grouped_data = filtered_data.groupby("library").sum()[["downloads_30d", "likes"]]
@@ -478,13 +577,23 @@ def main():
        )
        sums = grouped_data.sum()

+        d_old = filtered_data_old["library"].value_counts().rename_axis("library").to_frame('counts').reset_index()
+        grouped_data_old = filtered_data_old.groupby("library").sum()[["downloads_30d", "likes"]]
+        final_data_old = pd.merge(
+            d_old, grouped_data_old, how="outer", on="library"
+        ).add_suffix('_old')
+        final_data_old = final_data_old.rename(index=str, columns={"library_old": "library"})
+        sums_old = grouped_data_old.sum()
+
        col1, col2, col3 = st.columns(3)
        with col1:
-            st.metric(label="Total models", value=filtered_data.shape[0])
+            v = filtered_data.shape[0]
+            v_old = filtered_data_old.shape[0]
+            st.metric(label="Total models", value=v, delta=int(v-v_old))
        with col2:
-            st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"])
+            st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"], delta=int(sums["downloads_30d"]-sums_old["downloads_30d"]))
        with col3:
-            st.metric(label="Cumulative likes", value=sums["likes"])
+            st.metric(label="Cumulative likes", value=sums["likes"], delta=int(sums["likes"]-sums_old["likes"]))

        st.subheader("Most common library types (Learn more in library tab)")
        d = filtered_data["library"].value_counts().rename_axis("library").to_frame('counts').reset_index().head(15)
@@ -496,6 +605,13 @@ def main():


        st.subheader("Aggregated Data")
+        final_data = pd.merge(
+            final_data, final_data_old, how="outer", on="library"
+        )
+        final_data["counts_diff"] = final_data["counts"] - final_data["counts_old"]
+        final_data["downloads_diff"] = final_data["downloads_30d"] - final_data["downloads_30d_old"]
+        final_data["likes_diff"] = final_data["likes"] - final_data["likes_old"]
+
        st.dataframe(final_data)

        st.subheader("Raw Data")
@@ -509,28 +625,36 @@ def main():

        columns_of_interest = ["has_model_index", "has_metadata", "has_text", "text_length"]
        rows = data.shape[0]
+        rows_old = old_data.shape[0]

        cond = data["has_model_index"] | data["has_text"]
        with_model_card = data[cond]
        c_model_card = with_model_card.shape[0]
+
+        cond = old_data["has_model_index"] | old_data["has_text"]
+        with_model_card_old = old_data[cond]
+        c_model_card_old = with_model_card_old.shape[0]
+
        st.subheader("High-level metrics")
        col1, col2, col3 = st.columns(3)
        with col1:
-            st.metric(label="# models with model card file", value=c_model_card)
+            st.metric(label="# models with model card file", value=c_model_card, delta=int(c_model_card-c_model_card_old))
        with col2:
-            st.metric(label="# models without model card file", value=rows-c_model_card)
+            st.metric(label="# models without model card file", value=rows-c_model_card, delta=int((rows-c_model_card)-(rows_old-c_model_card_old)))

        with_index = data["has_model_index"].sum()
+        with_index_old = old_data["has_model_index"].sum()
        with col1:
-            st.metric(label="# models with model index", value=with_index)
+            st.metric(label="# models with model index", value=with_index, delta=int(with_index-with_index_old))
        with col2:
-            st.metric(label="# models without model index", value=rows-with_index)
+            st.metric(label="# models without model index", value=rows-with_index, delta=int((rows-with_index)-(rows_old-with_index_old)))

        with_text = data["has_text"]
+        with_text_old = old_data["has_text"]
        with col1:
-            st.metric(label="# models with model card text", value=with_text.sum())
+            st.metric(label="# models with model card text", value=with_text.sum(), delta=int(with_text.sum()-with_text_old.sum()))
        with col2:
-            st.metric(label="# models without model card text", value=rows-with_text.sum())
+            st.metric(label="# models without model card text", value=rows-with_text.sum(), delta=int((rows-with_text.sum())-(rows_old-with_text_old.sum())))


        st.subheader("Length (chars) of model card content")
 
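One caveat when reusing the loader above: models.py applies `st.cache(allow_output_mutation=True)` as a bare statement, which builds a cache wrapper and immediately discards it, so `process_dataset` is not actually cached and switching revisions reloads the data each time. For the cache to take effect with the `st.cache` API of that Streamlit era, it would be written as a decorator; a sketch with a hypothetical loader body:

```python
import pandas as pd
import streamlit as st

@st.cache(allow_output_mutation=True)  # decorator form actually wraps the function
def process_dataset(version):
    # Hypothetical body; the real implementation in models.py differs.
    return pd.read_csv(f"models_{version}.csv")
```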