Change transcription to fp16, and check whether vocal separation was done by another script: if it was, do not separate the vocals here; if it was not, separate them.
transcribe.py  CHANGED  (+9, -8)
@@ -20,15 +20,15 @@ for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
         "translator": language_code
     }
 
-def transcribe(audio_file, language, device):
+def transcribe(audio_file, language, device, vocals):
     output_folder = "transcriptions"
 
     # Transcribe audio file
     model = "large-v2"
     # word_timestamps = True
     print_progress = True
-    compute_type = "
-    fp16 =
+    compute_type = "float16"
+    fp16 = True
     batch_size = 8
     verbose = False
     min_speakers = 1
@@ -38,9 +38,9 @@ def transcribe(audio_file, language, device):
     hf_token = "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"
     command = f'whisperx {audio_file} --model {model} --batch_size {batch_size} --compute_type {compute_type} \
         --output_dir {output_folder} --output_format {output_format} --verbose {verbose} --language {language} \
-        --fp16 {fp16} --threads {threads} --print_progress {print_progress} --
-
-
+        --fp16 {fp16} --threads {threads} --print_progress {print_progress} --device {device}'
+    if vocals:
+        command += f' --diarize --max_speakers {max_speakers} --min_speakers {min_speakers} --hf_token {hf_token}'
     os.system(command)
 
 if __name__ == "__main__":
@@ -49,6 +49,7 @@ if __name__ == "__main__":
     parser.add_argument('language', help='Language of the audio file')
     parser.add_argument('speakers_file', help='File with the number of speakers')
     parser.add_argument('device', help='Device to use for PyTorch inference')
+    parser.add_argument('vocals', help='Vocals or not')
     args = parser.parse_args()
 
     vocals_folder = "vocals"
@@ -66,8 +67,8 @@
         extension = "wav"
         for i in range(speakers):
            file = f'{vocals_folder}/{input_name}_speaker{i:003d}.{extension}'
-           transcribe(file, language_dict[args.language]["transcriber"], args.device)
+           transcribe(file, language_dict[args.language]["transcriber"], args.device, args.vocals)
     else:
         extension = "mp3"
         file = f'{vocals_folder}/{input_name}.{extension}'
-        transcribe(file, language_dict[args.language]["transcriber"], args.device)
+        transcribe(file, language_dict[args.language]["transcriber"], args.device, args.vocals)
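For illustration, here is a minimal sketch of the command-building logic the new vocals flag controls, factored into a hypothetical build_command() helper so it can be run without WhisperX installed. The helper name, sample file paths, and the speaker/token values in the diarization branch are assumptions for the sketch, not part of the commit.

def build_command(audio_file, language, device, vocals,
                  model="large-v2", batch_size=8, compute_type="float16"):
    # Mirrors the diff: the base whisperx command is always built, and the
    # diarization options are appended only when `vocals` is truthy.
    command = (
        f"whisperx {audio_file} --model {model} --batch_size {batch_size} "
        f"--compute_type {compute_type} --language {language} --device {device}"
    )
    if vocals:
        # Speaker bounds and the placeholder token are illustrative assumptions.
        command += " --diarize --min_speakers 1 --max_speakers 2 --hf_token <HF_TOKEN>"
    return command

print(build_command("vocals/audio_speaker000.wav", "en", "cuda", vocals=True))
print(build_command("vocals/audio.mp3", "en", "cuda", vocals=False))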
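One caveat worth noting: argparse passes positional arguments through as strings, so any non-empty value for the new vocals argument (including the literal string "False") is truthy and takes the diarization branch. If a real boolean is intended, the value could be normalized before calling transcribe(); the parse_bool helper below is a hypothetical illustration, not part of the commit.

def parse_bool(value):
    # Map common textual truth values to a Python bool; everything else is False.
    return str(value).strip().lower() in ("1", "true", "yes", "y")

print(parse_bool("False"))  # False
print(parse_bool("true"))   # True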