Julien Simon committed on
Commit b63ff12
1 Parent(s): 6094dcf
Files changed (5)
  1. app.py +1 -1
  2. results.py +1 -0
  3. results_arcee_meraj.py +24 -0
  4. results_arcee_nova.py +23 -0
  5. results_llama_spark.py +36 -10
app.py CHANGED
@@ -202,7 +202,7 @@ with gr.Blocks() as demo:
         """This table shows the benchmark results for each model. \n\n
         Configurations are default unless noted.
         [TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher),
-        [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html), etc.)are default unless noted."""
+        [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html)"""
     )
     model_dropdown = gr.Dropdown(choices=get_model_names(), label="Select Model")
 
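For context, the edited Markdown block presumably sits inside the Gradio UI defined in app.py. Below is a minimal, hypothetical sketch of how such a dropdown-plus-table layout could be wired; get_model_names() and update_table() are illustrative stand-ins, not the actual code in this repository.

# Hypothetical sketch of the surrounding Gradio wiring (not the repository's code):
# a dropdown of model names drives a results table. Uses only the public Gradio
# API (gr.Blocks, gr.Markdown, gr.Dropdown, gr.Dataframe, .change).
import gradio as gr

def get_model_names():
    # Stand-in: app.py presumably builds this list from the results_*.py modules.
    return ["Arcee-Meraj", "Arcee-Nova", "Llama-Spark"]

def update_table(model_name):
    # Stand-in: return rows for the selected model.
    return [[model_name, "g6e.12xlarge", "vLLM 0.5.5", "45"]]

with gr.Blocks() as demo:
    gr.Markdown(
        """This table shows the benchmark results for each model. \n\n
        Configurations are default unless noted.
        [TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher),
        [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html)"""
    )
    model_dropdown = gr.Dropdown(choices=get_model_names(), label="Select Model")
    table = gr.Dataframe(headers=["Model", "Instance", "Container", "Tokens/s"])
    model_dropdown.change(fn=update_table, inputs=model_dropdown, outputs=table)

demo.launch()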
results.py CHANGED
@@ -18,6 +18,7 @@ instance_type_mappings = {
     "g6.12xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA L4", "gpuRAM": "96 GB"},
     "g6.48xlarge": {"cloud": "AWS", "gpu": "8xNVIDIA L4", "gpuRAM": "192 GB"},
     "g6e.2xlarge": {"cloud": "AWS", "gpu": "1xNVIDIA L40S", "gpuRAM": "48 GB"},
+    "g6e.12xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA L40S", "gpuRAM": "192 GB"},
     "g4dn.12xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA T4", "gpuRAM": "64 GB"},
     "p4d.24xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA A100", "gpuRAM": "320 GB"},
     "p4de.24xlarge": {"cloud": "AWS", "gpu": "8xNVIDIA A100", "gpuRAM": "320 GB"},
results_arcee_meraj.py CHANGED
@@ -12,6 +12,30 @@ results_arcee_meraj = {
         "tokensPerSecond": "33",
         "notes": "",
     },
+    {
+        "instanceType": "g6e.12xlarge",
+        "quantization": "awq",
+        "container": "vLLM 0.5.5",
+        "status": "OK",
+        "tokensPerSecond": "45",
+        "notes": "",
+    },
+    {
+        "instanceType": "g6e.12xlarge",
+        "quantization": "awq",
+        "container": "TGI 2.2.0",
+        "status": "OK",
+        "tokensPerSecond": "46",
+        "notes": "",
+    },
+    {
+        "instanceType": "g6e.12xlarge",
+        "quantization": "awq",
+        "container": "SGLang 0.2.13",
+        "status": "OK",
+        "tokensPerSecond": "47.1",
+        "notes": "",
+    },
     {
         "instanceType": "p4d.24xlarge",
         "quantization": "none",
results_arcee_nova.py CHANGED
@@ -108,6 +108,29 @@ results_arcee_nova = {
         "status": "OK",
         "tokensPerSecond": "12",
     },
+    {
+        "instanceType": "g6e.12xlarge",
+        "configurations": [
+            {
+                "quantization": "none",
+                "container": "TGI 2.2.0",
+                "status": "OK",
+                "tokensPerSecond": "17",
+            },
+            {
+                "quantization": "none",
+                "container": "vLLM 0.5.5",
+                "status": "OK",
+                "tokensPerSecond": "17.8",
+            },
+            {
+                "quantization": "none",
+                "container": "SGLang 0.2.13",
+                "status": "OK",
+                "tokensPerSecond": "18.2",
+            },
+        ],
+    },
     {
         "instanceType": "p4d.24xlarge",
         "quantization": "none",
results_llama_spark.py CHANGED
@@ -46,22 +46,48 @@ results_llama_spark = {
     },
     {
         "instanceType": "g6e.2xlarge",
-        "quantization": "none",
-        "status": "OK",
         "configurations": [
-            {"container": "TGI 2.2.0", "tokensPerSecond": "42.1"},
-            {"container": "SGLang 0.2.13", "tokensPerSecond": "45"},
-            {"container": "vLLM 0.5.5", "tokensPerSecond": "43.4"},
+            {
+                "container": "TGI 2.2.0",
+                "quantization": "none",
+                "status": "OK",
+                "tokensPerSecond": "42.1",
+            },
+            {
+                "container": "SGLang 0.2.13",
+                "quantization": "none",
+                "status": "OK",
+                "tokensPerSecond": "45",
+            },
+            {
+                "container": "vLLM 0.5.5",
+                "quantization": "none",
+                "status": "OK",
+                "tokensPerSecond": "43.4",
+            },
         ],
     },
     {
         "instanceType": "g6e.12xlarge",
-        "quantization": "none",
-        "status": "OK",
         "configurations": [
-            {"container": "TGI 2.2.0", "tokensPerSecond": "112"},
-            {"container": "SGLang 0.2.13", "tokensPerSecond": "123"},
-            {"container": "vLLM 0.5.5", "tokensPerSecond": "106"},
+            {
+                "container": "TGI 2.2.0",
+                "quantization": "none",
+                "status": "OK",
+                "tokensPerSecond": "112",
+            },
+            {
+                "container": "SGLang 0.2.13",
+                "quantization": "none",
+                "status": "OK",
+                "tokensPerSecond": "123",
+            },
+            {
+                "container": "vLLM 0.5.5",
+                "quantization": "none",
+                "status": "OK",
+                "tokensPerSecond": "106",
+            },
         ],
     },
     {
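Taken together, the results_*.py changes leave two record shapes in play: flat entries that carry "container" and "quantization" at the top level (results_arcee_meraj.py) and entries that nest per-container results under a "configurations" list (results_arcee_nova.py, results_llama_spark.py). Below is a small, hypothetical sketch of how a consumer could normalize both shapes into uniform rows and attach the GPU details from instance_type_mappings; the helper name, row format, and sample data layout are assumptions, not code from the repository.

# Hypothetical normalization sketch: flatten both record shapes into uniform rows
# and attach GPU details from instance_type_mappings (subset copied from results.py).
instance_type_mappings = {
    "g6e.2xlarge": {"cloud": "AWS", "gpu": "1xNVIDIA L40S", "gpuRAM": "48 GB"},
    "g6e.12xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA L40S", "gpuRAM": "192 GB"},
}

def flatten(entries):
    rows = []
    for entry in entries:
        gpu = instance_type_mappings.get(entry["instanceType"], {})
        # Nested entries carry a "configurations" list; flat entries are their own config.
        for cfg in entry.get("configurations", [entry]):
            rows.append({
                "instanceType": entry["instanceType"],
                "gpu": gpu.get("gpu", "unknown"),
                "container": cfg.get("container", ""),
                "quantization": cfg.get("quantization", entry.get("quantization", "")),
                "tokensPerSecond": cfg.get("tokensPerSecond", ""),
            })
    return rows

sample = [
    # Flat shape (results_arcee_meraj.py style)
    {"instanceType": "g6e.12xlarge", "quantization": "awq",
     "container": "SGLang 0.2.13", "status": "OK", "tokensPerSecond": "47.1"},
    # Nested shape (results_arcee_nova.py / results_llama_spark.py style)
    {"instanceType": "g6e.12xlarge", "configurations": [
        {"quantization": "none", "container": "vLLM 0.5.5",
         "status": "OK", "tokensPerSecond": "17.8"},
    ]},
]
print(flatten(sample))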