jsulz HF staff committed on
Commit
6118d5d
·
1 Parent(s): 9c5987b

updating layout, adding some additional dedupe analysis columns, adding new area chart

Browse files
Files changed (2) hide show
  1. app.py +77 -27
  2. pyproject.toml +1 -0
app.py CHANGED
@@ -40,11 +40,6 @@ def process_dataset():
40
  # drop any nas
41
  file_extensions_by_month = file_extensions_by_month.dropna()
42
 
43
- # Convert the total size to petabytes and format to two decimal places
44
- file_counts_and_sizes = format_dataframe_size_column(
45
- file_counts_and_sizes, "total_size"
46
- )
47
-
48
  file_counts_and_sizes["type"] = file_counts_and_sizes["type"].str.capitalize()
49
  # update the column name to 'total size (PB)'
50
  file_counts_and_sizes = file_counts_and_sizes.rename(
@@ -54,6 +49,8 @@ def process_dataset():
54
  "total_size": "Total Size (PBs)",
55
  }
56
  )
 
 
57
  # sort the dataframe by total size in descending order
58
  file_counts_and_sizes = file_counts_and_sizes.sort_values(
59
  by="Total Size (PBs)", ascending=False
@@ -80,7 +77,7 @@ def format_dataframe_size_column(_df, column_name):
80
  return _df
81
 
82
 
83
- def cumulative_growth_plot_analysis(df, df_compressed):
84
  """
85
  Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
86
 
@@ -121,7 +118,16 @@ def cumulative_growth_plot_analysis(df, df_compressed):
121
  last_10_months = cumulative_df.tail(10).copy()
122
  last_10_months["total"] = last_10_months.sum(axis=1)
123
  last_10_months["total_change"] = last_10_months["total"].diff()
 
 
 
 
 
 
 
 
124
  last_10_months = format_dataframe_size_column(last_10_months, "total_change")
 
125
  last_10_months["date"] = cumulative_df.tail(10).index
126
  # drop the dataset, model, and space
127
  last_10_months = last_10_months.drop(columns=["model", "space", "dataset"])
@@ -130,10 +136,17 @@ def cumulative_growth_plot_analysis(df, df_compressed):
130
  # drop the first row
131
  last_10_months = last_10_months.drop(last_10_months.index[0])
132
  # order the columns date, total, total_change
133
- last_10_months = last_10_months[["date", "total_change"]]
 
 
134
  # rename the columns
135
  last_10_months = last_10_months.rename(
136
- columns={"date": "Date", "total_change": "Month-to-Month Growth (PBs)"}
 
 
 
 
 
137
  )
138
 
139
  # Create a Plotly figure
@@ -146,6 +159,22 @@ def cumulative_growth_plot_analysis(df, df_compressed):
146
  "dataset": px.colors.qualitative.Alphabet[9],
147
  }
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  # Add a scatter trace for each type
150
  for column in cumulative_df.columns:
151
  fig.add_trace(
@@ -165,7 +194,7 @@ def cumulative_growth_plot_analysis(df, df_compressed):
165
  x=cumulative_df_compressed.index,
166
  y=cumulative_df_compressed[column] / 1e15, # Convert to petabytes
167
  mode="lines",
168
- name=column.capitalize() + " (Compressed)",
169
  line=dict(color=color_map.get(column, "black"), dash="dash"),
170
  )
171
  )
@@ -183,7 +212,7 @@ def cumulative_growth_plot_analysis(df, df_compressed):
183
 
184
  def plot_total_sum(by_type_arr):
185
  # Sort the array by size in decreasing order
186
- by_type_arr = sorted(by_type_arr, key=lambda x: x[1], reverse=True)
187
 
188
  # Create a Plotly figure
189
  fig = go.Figure()
@@ -221,7 +250,9 @@ def filter_by_extension_month(_df, _extension):
221
  fig (Figure): The Plotly figure object representing the line plot.
222
  """
223
  # Filter the DataFrame by the specified extension or extensions
224
- if len(_extension) == 1 and "All" in _extension or len(_extension) == 0:
 
 
225
  pass
226
  else:
227
  _df = _df[_df["extension"].isin(_extension)].copy()
@@ -263,10 +294,13 @@ def filter_by_extension_month(_df, _extension):
263
 
264
  # Create a gradio blocks interface and launch a demo
265
  with gr.Blocks() as demo:
266
- df, file_df, by_type, by_extension, by_extension_month = process_dataset()
267
 
268
- # Add a heading
269
- gr.Markdown("# Git LFS Analysis Across the Hub")
 
 
 
270
  gr.Markdown(
271
  "The Hugging Face Hub has just crossed 1,000,000 models - but where is all that data stored? The short answer is Git LFS. This analysis dives into the LFS storage on the Hub, breaking down the data by repository type, file extension, and growth over time."
272
  )
@@ -274,28 +308,39 @@ with gr.Blocks() as demo:
274
  gr.Markdown(
275
  "Now, you might ask yourself, 'Why are you doing this?' Well, the [Xet Team](https://huggingface.co/xet-team) is a [new addition to Hugging Face](https://huggingface.co/blog/xethub-joins-hf), bringing a new way to store massive datasets and models to enable ML teams to operate like software teams: Quickly and without friction. Because this story all starts with storage, that's where we've begun with our own deep dives into what the Hub holds. As part of this, we've included a look at what happens with just one simple deduplication strategy - deduplicating at the file level. Read on to see more!"
276
  )
 
 
 
 
 
277
  with gr.Row():
278
- # scale so that
279
- # group the data by month and year and compute a cumulative sum of the total_size column
280
- fig, last_10_months = cumulative_growth_plot_analysis(df, file_df)
281
  with gr.Column(scale=1):
282
- gr.Markdown("# Repository Growth")
283
  gr.Markdown(
284
- "The cumulative growth of models, spaces, and datasets over time can be seen in the adjacent chart. Beside that is a view of the total change, from the previous month to the current one, of LFS files stored on the hub over 2024. We're averaging nearly **2.3 PBs uploaded to LFS per month!**"
285
  )
286
- gr.Dataframe(last_10_months, height=250)
287
  with gr.Column(scale=3):
288
- gr.Plot(fig)
 
 
 
 
 
 
 
 
 
 
289
  with gr.Row():
290
  with gr.Column(scale=1):
291
  gr.Markdown(
292
- "This table shows the total number of files and cumulative size of those files across all repositories on the Hub. These numbers might be hard to grok, so let's try to put them in context. The last [Common Crawl](https://commoncrawl.org/) download was [451 TBs](https://github.com/commoncrawl/cc-crawl-statistics/blob/master/stats/crawler/CC-MAIN-2024-38.json#L31). The Spaces repositories alone outpaces that. Meanwhile, between Datasets and Model repos, the Hub stores **64 Common Crawls** 🤯."
293
  )
294
  with gr.Column(scale=3):
295
- gr.Dataframe(by_type)
296
 
297
- # Add a heading
298
- gr.Markdown("## File Extension Analysis")
 
299
  gr.Markdown(
300
  "Breaking this down by file extension, some interesting trends emerge. [Safetensors](https://huggingface.co/docs/safetensors/en/index) are quickly becoming the defacto standard on the hub, accounting for over 7PBs (25%) of LFS storage. The top 20 file extensions seen here and in the table below account for 82% of all LFS storage on the hub."
301
  )
@@ -330,7 +375,14 @@ with gr.Blocks() as demo:
330
  )
331
  gr.Dataframe(by_extension_size)
332
 
 
333
  gr.Markdown("## File Extension Growth Over Time")
 
 
 
 
 
 
334
  gr.Markdown(
335
  "Want to dig a little deeper? Select a file extension to see how many bytes of that type were uploaded to the Hub each month."
336
  )
@@ -338,8 +390,6 @@ with gr.Blocks() as demo:
338
  # build a dropdown using the unique values in the extension column
339
  extension = gr.Dropdown(
340
  choices=by_extension["extension"].unique().tolist(),
341
- value="All",
342
- allow_custom_value=True,
343
  multiselect=True,
344
  )
345
  _by_extension_month = gr.State(by_extension_month)
 
40
  # drop any nas
41
  file_extensions_by_month = file_extensions_by_month.dropna()
42
 
 
 
 
 
 
43
  file_counts_and_sizes["type"] = file_counts_and_sizes["type"].str.capitalize()
44
  # update the column name to 'total size (PB)'
45
  file_counts_and_sizes = file_counts_and_sizes.rename(
 
49
  "total_size": "Total Size (PBs)",
50
  }
51
  )
52
+ file_counts_and_sizes = file_counts_and_sizes.drop(columns=["Number of Files"])
53
+
54
  # sort the dataframe by total size in descending order
55
  file_counts_and_sizes = file_counts_and_sizes.sort_values(
56
  by="Total Size (PBs)", ascending=False
 
77
  return _df
78
 
79
 
80
+ def cumulative_growth_plot_analysis(df, df_compressed, repo_sizes):
81
  """
82
  Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
83
 
 
118
  last_10_months = cumulative_df.tail(10).copy()
119
  last_10_months["total"] = last_10_months.sum(axis=1)
120
  last_10_months["total_change"] = last_10_months["total"].diff()
121
+ last_10_months["compressed_change"] = (
122
+ cumulative_df_compressed.tail(10).sum(axis=1).diff()
123
+ )
124
+ last_10_months["savings"] = (
125
+ last_10_months["total_change"] - last_10_months["compressed_change"]
126
+ )
127
+ last_10_months = format_dataframe_size_column(last_10_months, "savings")
128
+ last_10_months = format_dataframe_size_column(last_10_months, "compressed_change")
129
  last_10_months = format_dataframe_size_column(last_10_months, "total_change")
130
+
131
  last_10_months["date"] = cumulative_df.tail(10).index
132
  # drop the dataset, model, and space
133
  last_10_months = last_10_months.drop(columns=["model", "space", "dataset"])
 
136
  # drop the first row
137
  last_10_months = last_10_months.drop(last_10_months.index[0])
138
  # order the columns date, total_change, compressed_change, savings
139
+ last_10_months = last_10_months[
140
+ ["date", "total_change", "compressed_change", "savings"]
141
+ ]
142
  # rename the columns
143
  last_10_months = last_10_months.rename(
144
+ columns={
145
+ "date": "Date",
146
+ "total_change": "Month-to-Month Growth (PBs)",
147
+ "compressed_change": "Growth with File-Level Deduplication (PBs)",
148
+ "savings": "Dedupe Savings (PBs)",
149
+ }
150
  )
151
 
152
  # Create a Plotly figure
 
159
  "dataset": px.colors.qualitative.Alphabet[9],
160
  }
161
 
162
+ # create a new column in the repository sizes dataframe for "compressed size" and set it to empty at first
163
+ repo_sizes["Compressed Size (PBs)"] = ""
164
+ repo_sizes["Dedupe Savings (PBs)"] = ""
165
+
166
+ for column in cumulative_df.columns:
167
+ cum_repo_size = cumulative_df[column].iloc[-1]
168
+ comp_repo_size = cumulative_df_compressed[column].iloc[-1]
169
+ repo_size_diff = cum_repo_size - comp_repo_size
170
+ repo_sizes.loc[
171
+ repo_sizes["Repository Type"] == column.capitalize(),
172
+ "Compressed Size (PBs)",
173
+ ] = comp_repo_size
174
+ repo_sizes.loc[
175
+ repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
176
+ ] = repo_size_diff
177
+
178
  # Add a scatter trace for each type
179
  for column in cumulative_df.columns:
180
  fig.add_trace(
 
194
  x=cumulative_df_compressed.index,
195
  y=cumulative_df_compressed[column] / 1e15, # Convert to petabytes
196
  mode="lines",
197
+ name=column.capitalize() + " (File-Level Deduplication)",
198
  line=dict(color=color_map.get(column, "black"), dash="dash"),
199
  )
200
  )
 
212
 
213
  def plot_total_sum(by_type_arr):
214
  # Sort the array by size in increasing order
215
+ by_type_arr = sorted(by_type_arr, key=lambda x: x[1])
216
 
217
  # Create a Plotly figure
218
  fig = go.Figure()
 
250
  fig (Figure): The Plotly figure object representing the line plot.
251
  """
252
  # Filter the DataFrame by the specified extension or extensions
253
+ if _extension is None:
254
+ pass
255
+ elif len(_extension) == 0:
256
  pass
257
  else:
258
  _df = _df[_df["extension"].isin(_extension)].copy()
 
294
 
295
  # Create a gradio blocks interface and launch a demo
296
  with gr.Blocks() as demo:
297
+ df, file_df, by_repo_type, by_extension, by_extension_month = process_dataset()
298
 
299
+ # get the figure for the cumulative growth plot and the last 10 months dataframe
300
+ fig, last_10_months = cumulative_growth_plot_analysis(df, file_df, by_repo_type)
301
+
302
+ # Add top level heading and introduction text
303
+ gr.Markdown("# Git LFS Usage Across the Hub")
304
  gr.Markdown(
305
  "The Hugging Face Hub has just crossed 1,000,000 models - but where is all that data stored? The short answer is Git LFS. This analysis dives into the LFS storage on the Hub, breaking down the data by repository type, file extension, and growth over time."
306
  )
 
308
  gr.Markdown(
309
  "Now, you might ask yourself, 'Why are you doing this?' Well, the [Xet Team](https://huggingface.co/xet-team) is a [new addition to Hugging Face](https://huggingface.co/blog/xethub-joins-hf), bringing a new way to store massive datasets and models to enable ML teams to operate like software teams: Quickly and without friction. Because this story all starts with storage, that's where we've begun with our own deep dives into what the Hub holds. As part of this, we've included a look at what happens with just one simple deduplication strategy - deduplicating at the file level. Read on to see more!"
310
  )
311
+ gr.HTML("<div style='height: 20px;'></div>")
312
+ # Cumulative growth analysis
313
+ gr.Markdown("## Repository Growth")
314
+ with gr.Row():
315
+ gr.Plot(fig)
316
  with gr.Row():
 
 
 
317
  with gr.Column(scale=1):
 
318
  gr.Markdown(
319
+ "This table shows the total number of files and cumulative size of those files across all repositories on the Hub. These numbers might be hard to grok, so let's try to put them in context. The last [Common Crawl](https://commoncrawl.org/) download was [451 TBs](https://github.com/commoncrawl/cc-crawl-statistics/blob/master/stats/crawler/CC-MAIN-2024-38.json#L31). The Spaces repositories alone outpaces that. Meanwhile, between Datasets and Model repos, the Hub stores **64 Common Crawls** 🤯."
320
  )
 
321
  with gr.Column(scale=3):
322
+ # Convert the total size to petabytes and format to two decimal places
323
+ by_repo_type = format_dataframe_size_column(
324
+ by_repo_type, "Total Size (PBs)"
325
+ )
326
+ by_repo_type = format_dataframe_size_column(
327
+ by_repo_type, "Compressed Size (PBs)"
328
+ )
329
+ by_repo_type = format_dataframe_size_column(
330
+ by_repo_type, "Dedupe Savings (PBs)"
331
+ )
332
+ gr.Dataframe(by_repo_type)
333
  with gr.Row():
334
  with gr.Column(scale=1):
335
  gr.Markdown(
336
+ "The cumulative growth of models, spaces, and datasets over time can be seen in the adjacent chart. Beside that is a view of the total change, from the previous month to the current one, of LFS files stored on the hub over 2024. We're averaging nearly **2.3 PBs uploaded to LFS per month!**"
337
  )
338
  with gr.Column(scale=3):
339
+ gr.Dataframe(last_10_months, height=250)
340
 
341
+ gr.HTML("<div style='height: 20px;'></div>")
342
+ # File Extension analysis
343
+ gr.Markdown("## File Extensions on the Hub")
344
  gr.Markdown(
345
  "Breaking this down by file extension, some interesting trends emerge. [Safetensors](https://huggingface.co/docs/safetensors/en/index) are quickly becoming the defacto standard on the hub, accounting for over 7PBs (25%) of LFS storage. The top 20 file extensions seen here and in the table below account for 82% of all LFS storage on the hub."
346
  )
 
375
  )
376
  gr.Dataframe(by_extension_size)
377
 
378
+ gr.HTML("<div style='height: 20px;'></div>")
379
  gr.Markdown("## File Extension Growth Over Time")
380
+ by_extension_month["date"] = pd.to_datetime(
381
+ by_extension_month[["year", "month"]].assign(day=1)
382
+ )
383
+ # make a plotly area chart with data and extension
384
+ figure = px.area(by_extension_month, x="date", y="total_size", color="extension")
385
+ gr.Plot(figure)
386
  gr.Markdown(
387
  "Want to dig a little deeper? Select a file extension to see how many bytes of that type were uploaded to the Hub each month."
388
  )
 
390
  # build a dropdown using the unique values in the extension column
391
  extension = gr.Dropdown(
392
  choices=by_extension["extension"].unique().tolist(),
 
 
393
  multiselect=True,
394
  )
395
  _by_extension_month = gr.State(by_extension_month)
pyproject.toml CHANGED
@@ -5,6 +5,7 @@ description = ""
5
  authors = ["jsulz <j.sulzdorf@gmail.com>"]
6
  license = "MIT"
7
  readme = "README.md"
 
8
 
9
  [tool.poetry.dependencies]
10
  python = "^3.12"
 
5
  authors = ["jsulz <j.sulzdorf@gmail.com>"]
6
  license = "MIT"
7
  readme = "README.md"
8
+ package-mode = false
9
 
10
  [tool.poetry.dependencies]
11
  python = "^3.12"