thomwolf HF staff committed on
Commit
0f43891
·
1 Parent(s): 7109720
assets/data/benchmarks/tp_scaling.html ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ <div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
2
+ <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script> <div id="1c31d751-4554-4c49-b217-074140865d95" class="plotly-graph-div" style="height:400px; width:1000px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("1c31d751-4554-4c49-b217-074140865d95")) { Plotly.newPlot( "1c31d751-4554-4c49-b217-074140865d95", [{"marker":{"color":"#4ea5b7"},"name":"Tokens\u002fsec\u002fGPU","width":0.7,"x":["2","4","8","16","32"],"y":[13923.18,12420.76,10903.32,6245.6,2146.44],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[12420.76],"marker":{"color":"#e889ab"},"name":"Performance Drop","showlegend":true,"width":0.0875,"x":["4"],"y":[1502.42],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[10903.32],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["8"],"y":[1517.4400000000005],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[6245.6],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["16"],"y":[4657.719999999999],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[2146.44],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["32"],"y":[4099.16],"type":"bar","xaxis":"x","yaxis":"y"},{"marker":{"color":"#cec0fa"},"name":"Max Batch Size","text":["3","8","12","16","20"],"textposition":"inside","width":0.7,"x":["2","4","8","16","32"],"y":[3,8,12,16,20],"type":"bar","xaxis":"x2","yaxis":"y2"}], 
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"
],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinec
olor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":tr
ue,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.45],"title":{"text":"Tensor Parallelism (TP)"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"Tokens\u002fsec\u002fGPU"},"showgrid":true,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.55,1.0],"title":{"text":"Tensor Parallelism (TP)"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"title":{"text":"Maximum Batch Size"},"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Throughput Scaling with TP (3B Model)","x":0.225,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Maximum Batch Size per TP 
Value","x":0.775,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-10.8%","x":1,"xanchor":"center","xref":"x","xshift":30,"y":13171.970000000001,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-12.2%","x":2,"xanchor":"center","xref":"x","xshift":30,"y":11662.04,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-42.7%","x":3,"xanchor":"center","xref":"x","xshift":30,"y":8574.46,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-65.6%","x":4,"xanchor":"center","xref":"x","xshift":30,"y":4196.02,"yanchor":"middle","yref":"y"}],"legend":{"x":0.55,"y":1.0},"width":1000,"height":400,"barmode":"stack"}, {"responsive": true, "scrollZoom": false} ) }; </script> </div>
assets/images/tp_diagram.svg ADDED
dist/assets/data/benchmarks/tp_scaling.html ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ <div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
2
+ <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script> <div id="1c31d751-4554-4c49-b217-074140865d95" class="plotly-graph-div" style="height:400px; width:1000px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("1c31d751-4554-4c49-b217-074140865d95")) { Plotly.newPlot( "1c31d751-4554-4c49-b217-074140865d95", [{"marker":{"color":"#4ea5b7"},"name":"Tokens\u002fsec\u002fGPU","width":0.7,"x":["2","4","8","16","32"],"y":[13923.18,12420.76,10903.32,6245.6,2146.44],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[12420.76],"marker":{"color":"#e889ab"},"name":"Performance Drop","showlegend":true,"width":0.0875,"x":["4"],"y":[1502.42],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[10903.32],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["8"],"y":[1517.4400000000005],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[6245.6],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["16"],"y":[4657.719999999999],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[2146.44],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["32"],"y":[4099.16],"type":"bar","xaxis":"x","yaxis":"y"},{"marker":{"color":"#cec0fa"},"name":"Max Batch Size","text":["3","8","12","16","20"],"textposition":"inside","width":0.7,"x":["2","4","8","16","32"],"y":[3,8,12,16,20],"type":"bar","xaxis":"x2","yaxis":"y2"}], 
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"
],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinec
olor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":tr
ue,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.45],"title":{"text":"Tensor Parallelism (TP)"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"Tokens\u002fsec\u002fGPU"},"showgrid":true,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.55,1.0],"title":{"text":"Tensor Parallelism (TP)"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"title":{"text":"Maximum Batch Size"},"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Throughput Scaling with TP (3B Model)","x":0.225,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Maximum Batch Size per TP 
Value","x":0.775,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-10.8%","x":1,"xanchor":"center","xref":"x","xshift":30,"y":13171.970000000001,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-12.2%","x":2,"xanchor":"center","xref":"x","xshift":30,"y":11662.04,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-42.7%","x":3,"xanchor":"center","xref":"x","xshift":30,"y":8574.46,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-65.6%","x":4,"xanchor":"center","xref":"x","xshift":30,"y":4196.02,"yanchor":"middle","yref":"y"}],"legend":{"x":0.55,"y":1.0},"width":1000,"height":400,"barmode":"stack"}, {"responsive": true, "scrollZoom": false} ) }; </script> </div>
dist/assets/images/tp_diagram.svg ADDED
dist/index.html CHANGED
@@ -849,7 +849,7 @@
849
 
850
  <p>In practice a small example of the operation looks like this:</p>
851
 
852
- <p><img alt="image.png" src="/assets/images/tp_diagram.png" /></p>
853
 
854
  <p>Let’s see how we can parallelise this operation! In tensor parallelism, tensors will be split into N shards along a particular dimension and distributed across N GPUs. Matrices can be split either on the column part or row part leading to row and column parallelism. One thing we’ll see in the following is that choosing row or column sharding will require different communications primitives.</p>
855
 
@@ -911,7 +911,16 @@
911
 
912
  <p>Tensor parallelism does help reduce activation memory for the matrix multiplications since the intermediate activations are sharded across GPUs. However, we still need to gather the full activations for operations like LayerNorm, which means we're not getting the full memory benefits we could. Additionally, it introduces significant communication requirements that heavily depend on the network infrastructure. The inability to hide this particular AllReduce behind computation means it directly adds to the critical path of forward propagation.</p>
913
 
914
- <p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p>
 
 
 
 
 
 
 
 
 
915
 
916
  <p>Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training.</p>
917
 
@@ -1262,7 +1271,6 @@
1262
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1263
  });
1264
  </script>
1265
-
1266
  <!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
1267
 
1268
  <p>Looking at the figure above, we notice something interesting: while the parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers need to be sent to the next GPU to continue the forward pass.</p>
@@ -1395,7 +1403,6 @@
1395
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1396
  });
1397
  </script>
1398
-
1399
  <!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
1400
 
1401
 
 
849
 
850
  <p>In practice a small example of the operation looks like this:</p>
851
 
852
+ <p><img class="l-body" alt="TP diagram" src="/assets/images/tp_diagram.svg" /></p>
853
 
854
  <p>Let’s see how we can parallelise this operation! In tensor parallelism, tensors will be split into N shards along a particular dimension and distributed across N GPUs. Matrices can be split either on the column part or row part leading to row and column parallelism. One thing we’ll see in the following is that choosing row or column sharding will require different communications primitives.</p>
855
 
 
911
 
912
  <p>Tensor parallelism does help reduce activation memory for the matrix multiplications since the intermediate activations are sharded across GPUs. However, we still need to gather the full activations for operations like LayerNorm, which means we're not getting the full memory benefits we could. Additionally, it introduces significant communication requirements that heavily depend on the network infrastructure. The inability to hide this particular AllReduce behind computation means it directly adds to the critical path of forward propagation.</p>
913
 
914
+ <iframe class="l-body-outset" id="plotFrame13" src="assets/data/benchmarks/tp_scaling.html" title="Tensor parallelism scaling: per-GPU throughput and maximum batch size" width="90%" scrolling="no" frameborder="0"></iframe>
915
+ <script>
916
+ window.addEventListener('load', function() { // window 'load' fires once sub-frames have loaded, so the embedded plot document can be measured
917
+ const frame = document.getElementById('plotFrame13');
918
+ frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px'; // size the iframe to the embedded document's scroll height
919
+ frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px'; // size the iframe to the embedded document's scroll width
920
+ });
921
+ </script>
922
+ <!--
923
+ <p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p> -->
924
 
925
  <p>Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training.</p>
926
 
 
1271
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1272
  });
1273
  </script>
 
1274
  <!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
1275
 
1276
  <p>Looking at the figure above, we notice something interesting: while the parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers need to be sent to the next GPU to continue the forward pass.</p>
 
1403
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1404
  });
1405
  </script>
 
1406
  <!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
1407
 
1408
 
src/index.html CHANGED
@@ -849,7 +849,7 @@
849
 
850
  <p>In practice a small example of the operation looks like this:</p>
851
 
852
- <p><img alt="image.png" src="/assets/images/tp_diagram.png" /></p>
853
 
854
  <p>Let’s see how we can parallelise this operation! In tensor parallelism, tensors will be split into N shards along a particular dimension and distributed across N GPUs. Matrices can be split either on the column part or row part leading to row and column parallelism. One thing we’ll see in the following is that choosing row or column sharding will require different communications primitives.</p>
855
 
@@ -911,7 +911,16 @@
911
 
912
  <p>Tensor parallelism does help reduce activation memory for the matrix multiplications since the intermediate activations are sharded across GPUs. However, we still need to gather the full activations for operations like LayerNorm, which means we're not getting the full memory benefits we could. Additionally, it introduces significant communication requirements that heavily depend on the network infrastructure. The inability to hide this particular AllReduce behind computation means it directly adds to the critical path of forward propagation.</p>
913
 
914
- <p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p>
 
 
 
 
 
 
 
 
 
915
 
916
  <p>Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training.</p>
917
 
@@ -1262,7 +1271,6 @@
1262
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1263
  });
1264
  </script>
1265
-
1266
  <!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
1267
 
1268
  <p>Looking at the figure above, we notice something interesting: while the parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers need to be sent to the next GPU to continue the forward pass.</p>
@@ -1395,7 +1403,6 @@
1395
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1396
  });
1397
  </script>
1398
-
1399
  <!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
1400
 
1401
 
 
849
 
850
  <p>In practice a small example of the operation looks like this:</p>
851
 
852
+ <p><img class="l-body" alt="TP diagram" src="/assets/images/tp_diagram.svg" /></p>
853
 
854
  <p>Let’s see how we can parallelise this operation! In tensor parallelism, tensors will be split into N shards along a particular dimension and distributed across N GPUs. Matrices can be split either on the column part or row part leading to row and column parallelism. One thing we’ll see in the following is that choosing row or column sharding will require different communications primitives.</p>
855
 
 
911
 
912
  <p>Tensor parallelism does help reduce activation memory for the matrix multiplications since the intermediate activations are sharded across GPUs. However, we still need to gather the full activations for operations like LayerNorm, which means we're not getting the full memory benefits we could. Additionally, it introduces significant communication requirements that heavily depend on the network infrastructure. The inability to hide this particular AllReduce behind computation means it directly adds to the critical path of forward propagation.</p>
913
 
914
+ <iframe class="l-body-outset" id="plotFrame13" src="assets/data/benchmarks/tp_scaling.html" title="Tensor parallelism scaling: per-GPU throughput and maximum batch size" width="90%" scrolling="no" frameborder="0"></iframe>
915
+ <script>
916
+ window.addEventListener('load', function() { // window 'load' fires once sub-frames have loaded, so the embedded plot document can be measured
917
+ const frame = document.getElementById('plotFrame13');
918
+ frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px'; // size the iframe to the embedded document's scroll height
919
+ frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px'; // size the iframe to the embedded document's scroll width
920
+ });
921
+ </script>
922
+ <!--
923
+ <p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p> -->
924
 
925
  <p>Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training.</p>
926
 
 
1271
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1272
  });
1273
  </script>
 
1274
  <!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
1275
 
1276
  <p>Looking at the figure above, we notice something interesting: while the parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers need to be sent to the next GPU to continue the forward pass.</p>
 
1403
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1404
  });
1405
  </script>
 
1406
  <!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
1407
 
1408