Spaces:
Sleeping
Sleeping
import time | |
import torch | |
import librosa | |
import gradio as gr | |
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor | |
# Hugging Face Hub checkpoint shared by the acoustic model and its processor.
model_name = "Yehor/w2v-bert-2.0-uk-v2"

# Inference runs on the CPU; `inference` rejects files longer than
# `max_duration` seconds.
device = "cpu"
max_duration = 30

# Load the CTC model first, then move it to the target device.
asr_model = AutoModelForCTC.from_pretrained(model_name)
asr_model = asr_model.to(device)

# The processor turns raw audio into input features and decodes predicted ids.
processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
# Example recordings offered in the demo: sample_1.wav ... sample_6.wav.
audio_samples = [f"sample_{index}.wav" for index in range(1, 7)]
# Markdown rendered above the demo widgets.
description_head = "\n".join(
    (
        "# Speech-to-Text for Ukrainian v2",
        "## Overview",
        "This space uses https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2 model that solves",
        "a Speech-to-Text task for the Ukrainian language.",
    )
)

# Markdown rendered below the demo widgets.
description_foot = "\n".join(
    (
        "## Community",
        "- Join our Discord server - https://discord.gg/yVAjkBgmt4 - where we're talking about Data Science,",
        "Machine Learning, Deep Learning, and Artificial Intelligence.",
        "- Join our Speech Recognition Group in Telegram: https://t.me/speech_recognition_uk",
        "## Authors",
        "Yehor Smoliakov: https://github.com/egorsmkv on GitHub, and egorsmkv@gmail.com for private discussions.",
    )
)
def _format_results(results):
    """Render recognition results as the Markdown text shown in the UI."""
    result_texts = []
    for result in results:
        result_texts.extend(
            (
                f'**{result["path"]}**',
                "\n\n",
                f'> {result["transcription"]}',
                "\n\n",
                f'**Audio duration**: {result["audio_duration"]}',
                "\n",
                f'**Real-Time Factor**: {result["rtf"]}',
            )
        )
    return "\n".join(result_texts)


def inference(audio_path, progress=gr.Progress()):
    """Transcribe one audio file and return a Markdown report.

    Args:
        audio_path: Filesystem path of the audio file to recognize.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            kept exactly as in the original signature.

    Returns:
        Markdown text with the file name, the transcription, the audio
        duration and the real-time factor (processing time divided by
        audio duration).

    Raises:
        gr.Error: If no audio file was provided, or if the file is longer
            than ``max_duration`` seconds.
    """
    import os

    # Nothing to recognize: fail with a readable message instead of letting
    # librosa choke on an empty path.
    if not audio_path:
        raise gr.Error("Please provide an audio file first.")

    gr.Info("Starting process", duration=2)
    progress(0, desc="Starting")

    duration = librosa.get_duration(path=audio_path)
    if duration > max_duration:
        # Report the configured limit rather than a hard-coded number.
        raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")

    paths = [
        audio_path,
    ]

    results = []
    for path in progress.tqdm(paths, desc="Recognizing...", unit="file"):
        t0 = time.time()

        audio_duration = librosa.get_duration(path=path)
        audio_input, _ = librosa.load(path, mono=True, sr=16_000)

        features = processor([audio_input], sampling_rate=16_000).input_features
        features = torch.tensor(features).to(device)

        with torch.inference_mode():
            logits = asr_model(features).logits

        # Pick the highest-scoring token id at every position, then decode.
        predicted_ids = torch.argmax(logits, dim=-1)
        predictions = processor.batch_decode(predicted_ids)

        elapsed_time = round(time.time() - t0, 2)
        # Guard against zero-length audio so the division cannot fail.
        rtf = round(elapsed_time / audio_duration, 4) if audio_duration else 0.0
        audio_duration = round(audio_duration, 2)

        results.append(
            {
                "path": os.path.basename(path),
                "transcription": "\n".join(predictions),
                "audio_duration": audio_duration,
                "rtf": rtf,
            }
        )

    gr.Info("Finished...", duration=2)

    return _format_results(results)
# Top-level Gradio application.
demo = gr.Blocks(
    title="Speech-to-Text for Ukrainian",
    analytics_enabled=False,
)

# NOTE(review): the nesting below was reconstructed from statement order;
# confirm that the button and the footer sit outside the rows.
with demo:
    gr.Markdown(description_head)
    gr.Markdown(f"## Demo (max. duration: **{max_duration}** seconds)")

    with gr.Row():
        audio_file = gr.Audio(label="Audio file", type="filepath")
        transcription = gr.Markdown(
            label="Transcription",
            # Adjacent string literals are concatenated as-is, so the first
            # one must end with a space to avoid "button,upload".
            value="Recognized text will appear here. Use **an example file** below the Recognize button, "
            "upload **your audio file**, or use **the microphone** to record something...",
        )

    # Clicking the button runs `inference` on the selected file and shows the
    # returned Markdown in the transcription area.
    gr.Button("Recognize").click(inference, inputs=audio_file, outputs=transcription)

    with gr.Row():
        gr.Examples(
            label="Choose an example audio", inputs=audio_file, examples=audio_samples
        )

    gr.Markdown(description_foot)

if __name__ == "__main__":
    demo.launch()