whisper stuff
Signed-off-by: Balazs Horvath <acsipont@gmail.com>
ogg2wav
ADDED
@@ -0,0 +1,38 @@
+#!/bin/zsh
+
+# Function to convert ogg to wav
+convert_ogg_to_wav() {
+    local input_file="$1"
+    local output_file="${input_file:r}.wav"
+    ffmpeg -i "$input_file" "$output_file"
+    echo "Converted: $input_file -> $output_file"
+}
+
+# Set the target directory
+if [[ $# -eq 0 ]]; then
+    target_dir="."
+else
+    target_dir="$1"
+fi
+
+# Check if the target directory exists
+if [[ ! -d "$target_dir" ]]; then
+    echo "Error: Directory '$target_dir' does not exist."
+    exit 1
+fi
+
+# Find all .ogg files under the target directory, newline-split so names with spaces survive
+ogg_files=(${(f)"$(find "$target_dir" -type f -name "*.ogg")"})
+
+# Check if any .ogg files were found
+if [[ ${#ogg_files[@]} -eq 0 ]]; then
+    echo "No .ogg files found in '$target_dir' or its subdirectories."
+    exit 0
+fi
+
+# Convert each .ogg file to .wav
+for file in "${ogg_files[@]}"; do
+    convert_ogg_to_wav "$file"
+done
+
+echo "Conversion complete."
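A typical invocation of the new script, for reference (assuming it is saved as `ogg2wav`, marked executable, and `ffmpeg` is on the PATH; the `~/recordings` path is only an illustration):

    chmod +x ogg2wav
    ./ogg2wav ~/recordings    # convert every .ogg under ~/recordings
    ./ogg2wav                 # with no argument, search the current directory

Note that ffmpeg prompts before overwriting an existing .wav file; adding `-y` (always overwrite) or `-n` (never overwrite) to the `ffmpeg` call would make the script fully non-interactive.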
whisper
CHANGED
@@ -1,86 +1,49 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-"""
-This script uses the Whisper large-v3-turbo model from OpenAI for automatic speech recognition (ASR).
-The model is fine-tuned for faster performance with a minor quality trade-off. It leverages the Hugging Face
-Transformers library to load the model and processor, and performs transcription on an input audio file.
-
-Whisper is a state-of-the-art model for ASR and speech translation, proposed in the paper "Robust Speech
-Recognition via Large-Scale Weak Supervision" by Alec Radford et al. from OpenAI. Trained on over 5 million
-hours of labeled data, Whisper demonstrates a strong ability to generalize to many datasets and domains in
-a zero-shot setting.
-
-The script performs the following steps:
-1. Checks if a CUDA-enabled GPU is available and sets the appropriate device and data type.
-2. Loads the Whisper large-v3-turbo model and processor from the Hugging Face Hub.
-3. Initializes an ASR pipeline using the model and processor.
-4. Defines a function `transcribe_audio` that takes an audio file path as input, performs transcription,
-   and outputs the result to the terminal and a text file.
-5. The script expects an audio file path as a command-line argument and calls the `transcribe_audio` function.
-
-Usage:
-    whisper <audio_file>
-
-Dependencies:
-    - torch
-    - transformers
-    - datasets
-    - accelerate
-
-Example:
-    whisper sample_audio.wav
-"""
-
 import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from transformers import pipeline
 import sys
 import os
 
-# ... (model_id, device, and torch_dtype setup)
-
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-)
-model.to(device)
-
-processor = AutoProcessor.from_pretrained(model_id)
+MODEL_NAME = "openai/whisper-large-v3-turbo"
+BATCH_SIZE = 8
+
+device = 0 if torch.cuda.is_available() else "cpu"
 
 pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    torch_dtype=torch_dtype,
+    task="automatic-speech-recognition",
+    model=MODEL_NAME,
+    chunk_length_s=30,
     device=device,
 )
 
-def transcribe_audio(audio_file_path):
-    # ... (transcription producing result and output_text_path)
-
-    # Output the result to the terminal
-    print(result["text"])
-
-    # Save the result to a text file
-    with open(output_text_path, "w") as f:
-        f.write(result["text"])
+def transcribe(audio_file_path, task="transcribe"):
+    if not os.path.exists(audio_file_path):
+        print(f"Error: The file '{audio_file_path}' does not exist.")
+        return
+
+    try:
+        text = pipe(audio_file_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
+        return text
+    except Exception as e:
+        print(f"Error during transcription: {str(e)}")
+        return None
 
 if __name__ == "__main__":
-    if len(sys.argv) != 2:
-        print("Usage: python script.py <audio_file>")
+    if len(sys.argv) < 2:
+        print("Usage: python script.py <audio_file_path> [task]")
+        print("task can be 'transcribe' or 'translate' (default is 'transcribe')")
+        sys.exit(1)
+
+    audio_file_path = sys.argv[1]
+    task = sys.argv[2] if len(sys.argv) > 2 else "transcribe"
+
+    if task not in ["transcribe", "translate"]:
+        print("Error: task must be either 'transcribe' or 'translate'")
         sys.exit(1)
-
-    audio_file = sys.argv[1]
-    transcribe_audio(audio_file)
 
+    result = transcribe(audio_file_path, task)
+    if result:
+        print("Transcription result:")
+        print(result)
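Going by the new usage string, the rewritten script is invoked like this (the file name is a placeholder):

    python whisper sample_audio.wav
    python whisper sample_audio.wav translate

Design note: `chunk_length_s=30` makes the pipeline split audio longer than Whisper's native 30-second input window into chunks, and `batch_size=8` runs those chunks through the model in parallel, which is what makes long recordings practical with a single call.

The rewrite also drops the explicit half-precision handling the old version had. A minimal sketch of how it could be reintroduced on top of the pipeline-only approach (assumption: `transformers.pipeline` forwards `torch_dtype` to the underlying model; everything else mirrors the script above):

    import torch
    from transformers import pipeline

    # float16 on GPU for speed and memory, float32 on CPU for compatibility
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    pipe = pipeline(
        task="automatic-speech-recognition",
        model="openai/whisper-large-v3-turbo",
        chunk_length_s=30,
        torch_dtype=torch_dtype,
        device=0 if torch.cuda.is_available() else "cpu",
    )

    print(pipe("sample_audio.wav", generate_kwargs={"task": "transcribe"})["text"])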