Update README.md
#4
by
reach-vb
HF staff
- opened
README.md
CHANGED
@@ -119,8 +119,8 @@ Whisper is a state-of-the-art model for automatic speech recognition (ASR) and s
|
|
119 |
et al. from OpenAI. Trained on >5M hours of labeled data, Whisper demonstrates a strong ability to generalise to many
|
120 |
datasets and domains in a zero-shot setting.
|
121 |
|
122 |
-
Whisper large-v3-turbo is a
|
123 |
-
As a result, the model is way faster, at the expense of a minor quality degradation.
|
124 |
|
125 |
**Disclaimer**: Content for this model card has partly been written by the 🤗 Hugging Face team, and partly copied and
|
126 |
pasted from the original model card.
|
@@ -148,7 +148,7 @@ from datasets import load_dataset
|
|
148 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
149 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
150 |
|
151 |
-
model_id = "
|
152 |
|
153 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
154 |
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
|
@@ -252,7 +252,7 @@ from datasets import Audio, load_dataset
|
|
252 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
253 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
254 |
|
255 |
-
model_id = "
|
256 |
|
257 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
258 |
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
|
@@ -327,7 +327,7 @@ from datasets import load_dataset
|
|
327 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
328 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
329 |
|
330 |
-
model_id = "
|
331 |
|
332 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
333 |
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
|
@@ -373,7 +373,7 @@ torch.set_float32_matmul_precision("high")
|
|
373 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
374 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
375 |
|
376 |
-
model_id = "
|
377 |
|
378 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
379 |
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
|
@@ -472,7 +472,7 @@ checkpoints are summarised in the following table with links to the models on th
|
|
472 |
| large | 1550 M | x | [✓](https://huggingface.co/openai/whisper-large) |
|
473 |
| large-v2 | 1550 M | x | [✓](https://huggingface.co/openai/whisper-large-v2) |
|
474 |
| large-v3 | 1550 M | x | [✓](https://huggingface.co/openai/whisper-large-v3) |
|
475 |
-
| large-v3-turbo | 809 M | x | [✓](https://huggingface.co/
|
476 |
|
477 |
|
478 |
## Fine-Tuning
|
|
|
119 |
et al. from OpenAI. Trained on >5M hours of labeled data, Whisper demonstrates a strong ability to generalise to many
|
120 |
datasets and domains in a zero-shot setting.
|
121 |
|
122 |
+
Whisper large-v3-turbo is a finetuned version of a pruned [Whisper large-v3](https://huggingface.co/openai/whisper-large-v3). In other words, it's the exact same model, except that the number of decoding layers has been reduced from 32 to 4.
|
123 |
+
As a result, the model is way faster, at the expense of a minor quality degradation. You can find more details about it [in this GitHub discussion](https://github.com/openai/whisper/discussions/2363).
|
124 |
|
125 |
**Disclaimer**: Content for this model card has partly been written by the 🤗 Hugging Face team, and partly copied and
|
126 |
pasted from the original model card.
|
|
|
148 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
149 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
150 |
|
151 |
+
model_id = "openai/whisper-large-v3-turbo"
|
152 |
|
153 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
154 |
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
|
|
|
252 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
253 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
254 |
|
255 |
+
model_id = "openai/whisper-large-v3-turbo"
|
256 |
|
257 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
258 |
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
|
|
|
327 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
328 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
329 |
|
330 |
+
model_id = "openai/whisper-large-v3-turbo"
|
331 |
|
332 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
333 |
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
|
|
|
373 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
374 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
375 |
|
376 |
+
model_id = "openai/whisper-large-v3-turbo"
|
377 |
|
378 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
379 |
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
|
|
|
472 |
| large | 1550 M | x | [✓](https://huggingface.co/openai/whisper-large) |
|
473 |
| large-v2 | 1550 M | x | [✓](https://huggingface.co/openai/whisper-large-v2) |
|
474 |
| large-v3 | 1550 M | x | [✓](https://huggingface.co/openai/whisper-large-v3) |
|
475 |
+
| large-v3-turbo | 809 M | x | [✓](https://huggingface.co/openai/whisper-large-v3-turbo) |
|
476 |
|
477 |
|
478 |
## Fine-Tuning
|