Spaces:
Sleeping
Sleeping
File size: 2,545 Bytes
521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 c9a70b0 521aa78 89fc18c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
import torch
import librosa
import numpy as np
import os
# Load the Facebook MMS language-identification checkpoint once, at import time.
MODEL_ID = "facebook/mms-lid-1024"
# Feature extractor that identify() calls to turn a waveform into model inputs.
processor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
# Classifier whose output indices identify() maps to ISO codes via
# model.config.id2label.
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_ID)
# Constants
# Sampling rate that identify() resamples audio to and passes to the processor.
LID_SAMPLING_RATE = 16_000
# identify() returns a low-confidence message when the best score is below this.
LID_THRESHOLD = 0.33
# ISO code -> language name; populated from the language label file below.
LID_LANGUAGES = {}
# Load Language Labels
# Each non-empty line of the label file is "<iso><whitespace><language name>".
LANG_FILE = "data/lid/all_langs.tsv"
if not os.path.exists(LANG_FILE):
    raise FileNotFoundError(f"Language file '{LANG_FILE}' not found!")
with open(LANG_FILE, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing empty line) carry no label and
            # would otherwise break the two-name unpack below.
            continue
        # Split on the first run of whitespace so both tab- and
        # space-separated files load; the name itself may contain spaces.
        iso, name = line.split(None, 1)
        LID_LANGUAGES[iso] = name
# Identify Audio Language
def _to_mono_float32(samples):
    """Convert raw microphone samples to a mono float32 waveform.

    Integer PCM is scaled into [-1.0, 1.0) using the range of its dtype
    (32768 for int16); floating-point input is only cast, never rescaled.
    """
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = (samples / full_scale).astype(np.float32)
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # NOTE(review): assumes a (samples, channels) layout, so every axis
        # after the first is averaged away -- confirm against the caller.
        samples = samples.mean(axis=tuple(range(1, samples.ndim)))
    return samples


def identify(audio_data=None):
    """Identify the spoken language of an audio clip.

    Args:
        audio_data: Either a ``(sample_rate, samples)`` tuple (microphone
            input) or a string path to an audio file (file upload).

    Returns:
        A dict mapping language name (or the ISO code when it is missing
        from ``LID_LANGUAGES``) to probability for the top predictions, or
        a message string when the input is empty, invalid, missing on disk,
        or the best score is below ``LID_THRESHOLD``.
    """
    if not audio_data:
        return "<<ERROR: Empty Audio Input>>"

    # Microphone Input
    if isinstance(audio_data, tuple):
        if len(audio_data) != 2:
            return "<<ERROR: Invalid Audio Input>>"
        sr, raw_samples = audio_data
        audio_samples = _to_mono_float32(raw_samples)
        # Skip resampling for empty clips; they are rejected just below.
        if audio_samples.size and sr != LID_SAMPLING_RATE:
            audio_samples = librosa.resample(
                audio_samples, orig_sr=sr, target_sr=LID_SAMPLING_RATE
            )
    # File Upload
    elif isinstance(audio_data, str):
        if not os.path.exists(audio_data):
            return f"<<ERROR: File '{audio_data}' not found!>>"
        audio_samples, _ = librosa.load(audio_data, sr=LID_SAMPLING_RATE, mono=True)
    else:
        return "<<ERROR: Invalid Audio Input>>"

    # A zero-length waveform cannot be classified; report it instead of
    # letting the feature extractor or model fail.
    if audio_samples.size == 0:
        return "<<ERROR: Empty Audio Input>>"

    # Process Input
    inputs = processor(audio_samples, sampling_rate=LID_SAMPLING_RATE, return_tensors="pt")

    # Select Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = inputs.to(device)

    # Predict Language
    with torch.no_grad():
        logits = model(**inputs).logits

    # Compute Probabilities (single clip, so drop the batch dimension)
    log_probs = torch.log_softmax(logits.squeeze(0), dim=-1)
    top_k = min(5, log_probs.shape[-1])
    scores, indices = torch.topk(log_probs, top_k, dim=-1)
    scores = torch.exp(scores).cpu().tolist()
    indices = indices.cpu().tolist()

    # Map to Language Labels
    iso2score = {model.config.id2label[int(i)]: s for s, i in zip(scores, indices)}

    # Confidence Check
    if max(iso2score.values()) < LID_THRESHOLD:
        return "Low confidence in language detection. No output shown."
    return {LID_LANGUAGES.get(iso, iso): score for iso, score in iso2score.items()}
# Example inputs: one single-element row (the audio file path) per sample clip.
LID_EXAMPLES = [
    [clip_path]
    for clip_path in (
        "upload/english.mp3",
        "upload/tamil.mp3",
        "upload/burmese.mp3",
    )
]
|