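# Turkish speech-to-text helper for a Hugging Face Space, built on a
# Wav2Vec2 CTC model fine-tuned on Common Voice Turkish.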
import re

import librosa
import numpy as np
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Punctuation and stray symbols to strip from transcripts (duplicates removed).
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", "%", "'", '"', "�",
    "#", "«", "»", "(", ")", "؛", "“", "‘", "–", "…", "_", "”", "„",
]
# Zero-width and directional marks mapped to plain spaces.
chars_to_mapping = {
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}
class SpeechRecognition:
    def __init__(self):
        print("init SpeechRecognition")

    def load_model(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Alternative Turkish checkpoint: m3hrdadfi/wav2vec2-large-xlsr-turkish
        self.processor = Wav2Vec2Processor.from_pretrained("patrickvonplaten/wav2vec2-common_voice-tr-demo")
        self.model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-common_voice-tr-demo").to(self.device)
        return self
    def multiple_replace(self, text, chars_to_mapping):
        # Replace every key of chars_to_mapping with its value in a single pass.
        pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
        return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))

    def remove_special_characters(self, text, chars_to_ignore_regex):
        # Drop ignored characters and lowercase; the trailing space mirrors the
        # original preprocessing.
        text = re.sub(chars_to_ignore_regex, "", text).lower() + " "
        return text
    def normalizer(self, batch, chars_to_ignore, chars_to_mapping):
        # Escape each character so "-" cannot form an accidental regex range
        # inside the character class.
        chars_to_ignore_regex = f"[{re.escape(''.join(chars_to_ignore))}]"
        text = batch["sentence"].lower().strip()
        text = text.replace("\u0307", " ").strip()
        text = self.multiple_replace(text, chars_to_mapping)
        text = self.remove_special_characters(text, chars_to_ignore_regex)
        batch["sentence"] = text
        return batch
    def speech_file_to_array_fn(self, batch):
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        speech_array = speech_array.squeeze().numpy()
        # librosa >= 0.10 requires keyword arguments for resample.
        speech_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=16_000)
        batch["speech"] = speech_array
        return batch
    def predict(self, batch):
        features = self.processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
        input_values = features.input_values.to(self.device)
        attention_mask = features.attention_mask.to(self.device)
        with torch.no_grad():
            logits = self.model(input_values, attention_mask=attention_mask).logits
        # Greedy CTC decoding: take the most likely token at each frame.
        pred_ids = torch.argmax(logits, dim=-1)
        batch["predicted"] = self.processor.batch_decode(pred_ids)[0]
        return batch
    def predict_audio_file(self, speech):
        features = self.processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
        input_values = features.input_values.to(self.device)
        attention_mask = features.attention_mask.to(self.device)
        with torch.no_grad():
            logits = self.model(input_values, attention_mask=attention_mask).logits
        pred_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.decode(pred_ids[0])
        return transcription
    def load_speech_with_file(self, audio_file):
        # librosa resamples to 16 kHz on load, matching the model's expected rate.
        speech, rate = librosa.load(audio_file, sr=16_000)
        return speech, rate
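
# A minimal usage sketch, assuming a local audio file; "sample.wav" is a
# hypothetical path, not part of this Space.
if __name__ == "__main__":
    asr = SpeechRecognition().load_model()
    speech, rate = asr.load_speech_with_file("sample.wav")
    print(asr.predict_audio_file(speech))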