k4d3 committed
Commit 1f9dab2
1 Parent(s): fc8f518

whisper stuff


Signed-off-by: Balazs Horvath <acsipont@gmail.com>

Files changed (2):
  1. ogg2wav +38 -0
  2. whisper +32 -69
ogg2wav ADDED
@@ -0,0 +1,38 @@
+ #!/bin/zsh
+
+ # Convert one .ogg file to .wav with ffmpeg
+ convert_ogg_to_wav() {
+     local input_file="$1"
+     local output_file="${input_file:r}.wav"
+     ffmpeg -i "$input_file" "$output_file"
+     echo "Converted: $input_file -> $output_file"
+ }
+
+ # Set the target directory (default: current directory)
+ if [[ $# -eq 0 ]]; then
+     target_dir="."
+ else
+     target_dir="$1"
+ fi
+
+ # Check if the target directory exists
+ if [[ ! -d "$target_dir" ]]; then
+     echo "Error: Directory '$target_dir' does not exist."
+     exit 1
+ fi
+
+ # Find all .ogg files under the target directory; the (f) flag splits on newlines only, so paths with spaces survive
+ ogg_files=(${(f)"$(find "$target_dir" -type f -name '*.ogg')"})
+
+ # Check if any .ogg files were found
+ if [[ ${#ogg_files[@]} -eq 0 ]]; then
+     echo "No .ogg files found in '$target_dir' or its subdirectories."
+     exit 0
+ fi
+
+ # Convert each .ogg file to .wav
+ for file in "${ogg_files[@]}"; do
+     convert_ogg_to_wav "$file"
+ done
+
+ echo "Conversion complete."
whisper CHANGED
@@ -1,86 +1,49 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- """
- This script uses the Whisper large-v3-turbo model from OpenAI for automatic speech recognition (ASR).
- The model is finetuned for faster performance with a minor quality trade-off. It leverages the Hugging Face
- Transformers library to load the model and processor, and performs transcription on an input audio file.
-
- Whisper is a state-of-the-art model for ASR and speech translation, proposed in the paper "Robust Speech
- Recognition via Large-Scale Weak Supervision" by Alec Radford et al. from OpenAI. Trained on over 5 million
- hours of labeled data, Whisper demonstrates a strong ability to generalize to many datasets and domains in
- a zero-shot setting.
-
- The script performs the following steps:
- 1. Checks if a CUDA-enabled GPU is available and sets the appropriate device and data type.
- 2. Loads the Whisper large-v3-turbo model and processor from the Hugging Face Hub.
- 3. Initializes an ASR pipeline using the model and processor.
- 4. Defines a function `transcribe_audio` that takes an audio file path as input, performs transcription,
-    and outputs the result to the terminal and a text file.
- 5. The script expects an audio file path as a command-line argument and calls the `transcribe_audio` function.
-
- Usage:
-     whisper <audio_file>
-
- Dependencies:
-     - torch
-     - transformers
-     - datasets
-     - accelerate
-
- Example:
-     whisper sample_audio.wav
- """
-
  import torch
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ from transformers import pipeline
  import sys
  import os

- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ MODEL_NAME = "openai/whisper-large-v3-turbo"
+ BATCH_SIZE = 8

- model_id = "openai/whisper-large-v3-turbo"
-
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
-     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
- )
- model.to(device)
-
- processor = AutoProcessor.from_pretrained(model_id)
+ device = 0 if torch.cuda.is_available() else "cpu"

  pipe = pipeline(
-     "automatic-speech-recognition",
-     model=model,
-     tokenizer=processor.tokenizer,
-     feature_extractor=processor.feature_extractor,
-     torch_dtype=torch_dtype,
+     task="automatic-speech-recognition",
+     model=MODEL_NAME,
+     chunk_length_s=30,
      device=device,
  )

- def transcribe_audio(audio_path):
-     # Load audio file
-     audio = {"path": audio_path}
-
-     # Perform transcription
-     result = pipe(audio)
-
-     # Get the base filename and directory
-     base_filename = os.path.splitext(audio_path)[0]
-     output_text_path = base_filename + ".txt"
-
-     # Output the result to the terminal
-     print(result["text"])
-
-     # Save the result to a text file
-     with open(output_text_path, "w") as f:
-         f.write(result["text"])
+ def transcribe(audio_file_path, task="transcribe"):
+     if not os.path.exists(audio_file_path):
+         print(f"Error: The file '{audio_file_path}' does not exist.")
+         return
+
+     try:
+         text = pipe(audio_file_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
+         return text
+     except Exception as e:
+         print(f"Error during transcription: {str(e)}")
+         return None

  if __name__ == "__main__":
-     if len(sys.argv) != 2:
-         print("Usage: python script.py <audio_file>")
+     if len(sys.argv) < 2:
+         print("Usage: python script.py <audio_file_path> [task]")
+         print("task can be 'transcribe' or 'translate' (default is 'transcribe')")
+         sys.exit(1)
+
+     audio_file_path = sys.argv[1]
+     task = sys.argv[2] if len(sys.argv) > 2 else "transcribe"
+
+     if task not in ["transcribe", "translate"]:
+         print("Error: task must be either 'transcribe' or 'translate'")
          sys.exit(1)
-
-     audio_file = sys.argv[1]
-     transcribe_audio(audio_file)

+     result = transcribe(audio_file_path, task)
+     if result:
+         print("Transcription result:")
+         print(result)
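
For reference, a usage sketch for the reworked script (assuming it is executable and on PATH, and that torch and transformers, the dependencies the removed docstring listed, are installed; sample_audio.wav is a placeholder):

    pip install torch transformers
    whisper sample_audio.wav              # transcribe in the spoken language
    whisper sample_audio.wav translate    # translate the speech to English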