whisper stuff
Signed-off-by: Balazs Horvath <acsipont@gmail.com>
ogg2wav
ADDED
@@ -0,0 +1,38 @@
+#!/bin/zsh
+
+# Function to convert ogg to wav
+convert_ogg_to_wav() {
+    local input_file="$1"
+    local output_file="${input_file:r}.wav"
+    ffmpeg -i "$input_file" "$output_file"
+    echo "Converted: $input_file -> $output_file"
+}
+
+# Set the target directory
+if [[ $# -eq 0 ]]; then
+    target_dir="."
+else
+    target_dir="$1"
+fi
+
+# Check if the target directory exists
+if [[ ! -d "$target_dir" ]]; then
+    echo "Error: Directory '$target_dir' does not exist."
+    exit 1
+fi
+
+# Find all .ogg files under the target directory, newline-split so names with spaces survive
+ogg_files=(${(f)"$(find "$target_dir" -type f -name "*.ogg")"})
+
+# Check if any .ogg files were found
+if [[ ${#ogg_files[@]} -eq 0 ]]; then
+    echo "No .ogg files found in '$target_dir' or its subdirectories."
+    exit 0
+fi
+
+# Convert each .ogg file to .wav
+for file in "${ogg_files[@]}"; do
+    convert_ogg_to_wav "$file"
+done
+
+echo "Conversion complete."
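A typical invocation of the new script, for reference (assuming it is saved as `ogg2wav`, marked executable, and `ffmpeg` is on the PATH; the `~/recordings` path is only an illustration):

    chmod +x ogg2wav
    ./ogg2wav ~/recordings    # convert every .ogg under ~/recordings
    ./ogg2wav                 # with no argument, search the current directory

Note that ffmpeg prompts before overwriting an existing .wav file; adding `-y` (always overwrite) or `-n` (never overwrite) to the `ffmpeg` call would make the script fully non-interactive.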
whisper
CHANGED
@@ -1,86 +1,49 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-"""
-This script uses the Whisper large-v3-turbo model from OpenAI for automatic speech recognition (ASR).
-The model is fine-tuned for faster performance with a minor quality trade-off. It leverages the Hugging Face
-Transformers library to load the model and processor, and performs transcription on an input audio file.
-
-Whisper is a state-of-the-art model for ASR and speech translation, proposed in the paper "Robust Speech
-Recognition via Large-Scale Weak Supervision" by Alec Radford et al. from OpenAI. Trained on over 5 million
-hours of labeled data, Whisper demonstrates a strong ability to generalize to many datasets and domains in
-a zero-shot setting.
-
-The script performs the following steps:
-1. Checks if a CUDA-enabled GPU is available and sets the appropriate device and data type.
-2. Loads the Whisper large-v3-turbo model and processor from the Hugging Face Hub.
-3. Initializes an ASR pipeline using the model and processor.
-4. Defines a function `transcribe_audio` that takes an audio file path as input, performs transcription,
-   and outputs the result to the terminal and a text file.
-5. The script expects an audio file path as a command-line argument and calls the `transcribe_audio` function.
-
-Usage:
-    whisper <audio_file>
-
-Dependencies:
-    - torch
-    - transformers
-    - datasets
-    - accelerate
-
-Example:
-    whisper sample_audio.wav
-"""
-
 import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from transformers import pipeline
 import sys
 import os
 
-# ... (model_id, device, and torch_dtype setup)
-
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-)
-model.to(device)
-
-processor = AutoProcessor.from_pretrained(model_id)
+MODEL_NAME = "openai/whisper-large-v3-turbo"
+BATCH_SIZE = 8
+
+device = 0 if torch.cuda.is_available() else "cpu"
 
 pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    torch_dtype=torch_dtype,
+    task="automatic-speech-recognition",
+    model=MODEL_NAME,
+    chunk_length_s=30,
     device=device,
 )
 
-def transcribe_audio(audio_file_path):
-    # ... (transcription producing result and output_text_path)
-
-    # Output the result to the terminal
-    print(result["text"])
-
-    # Save the result to a text file
-    with open(output_text_path, "w") as f:
-        f.write(result["text"])
+def transcribe(audio_file_path, task="transcribe"):
+    if not os.path.exists(audio_file_path):
+        print(f"Error: The file '{audio_file_path}' does not exist.")
+        return
+
+    try:
+        text = pipe(audio_file_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
+        return text
+    except Exception as e:
+        print(f"Error during transcription: {str(e)}")
+        return None
 
 if __name__ == "__main__":
-    if len(sys.argv) != 2:
-        print("Usage: python script.py <audio_file>")
+    if len(sys.argv) < 2:
+        print("Usage: python script.py <audio_file_path> [task]")
+        print("task can be 'transcribe' or 'translate' (default is 'transcribe')")
+        sys.exit(1)
+
+    audio_file_path = sys.argv[1]
+    task = sys.argv[2] if len(sys.argv) > 2 else "transcribe"
+
+    if task not in ["transcribe", "translate"]:
+        print("Error: task must be either 'transcribe' or 'translate'")
         sys.exit(1)
-
-    audio_file = sys.argv[1]
-    transcribe_audio(audio_file)
 
+    result = transcribe(audio_file_path, task)
+    if result:
+        print("Transcription result:")
+        print(result)
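Going by the new usage string, the rewritten script is invoked like this (the file name is a placeholder):

    python whisper sample_audio.wav
    python whisper sample_audio.wav translate

Design note: `chunk_length_s=30` makes the pipeline split audio longer than Whisper's native 30-second input window into chunks, and `batch_size=8` runs those chunks through the model in parallel, which is what makes long recordings practical with a single call.

The rewrite also drops the explicit half-precision handling the old version had. A minimal sketch of how it could be reintroduced on top of the pipeline-only approach (assumption: `transformers.pipeline` forwards `torch_dtype` to the underlying model; everything else mirrors the script above):

    import torch
    from transformers import pipeline

    # float16 on GPU for speed and memory, float32 on CPU for compatibility
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    pipe = pipeline(
        task="automatic-speech-recognition",
        model="openai/whisper-large-v3-turbo",
        chunk_length_s=30,
        torch_dtype=torch_dtype,
        device=0 if torch.cuda.is_available() else "cpu",
    )

    print(pipe("sample_audio.wav", generate_kwargs={"task": "transcribe"})["text"])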