Spaces:

facebook
/

XLS-R-300m-EN-15

Build error

File size: 3,130 Bytes

0566261
62a78c6
69502c9
62a78c6
8ace0d6
 
 
00349e4
 
0566261
62a78c6
 
 
 
 
69502c9
62a78c6
dd689bb
00349e4
3740191
e1e8cf6
 
00349e4
b393952
 
 
 
 
 
 
 
 
 
00349e4
62a78c6
dd689bb
e1e8cf6
dd689bb
00d34db
b393952
dd689bb
4eef8ac
e1e8cf6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
797417d
62a78c6
3f864d3
797417d
337470e
 
4eef8ac
47991e5
62a78c6
fade36a
9e02a9f
0faef8c
81017ad
4eef8ac
5b8b578
 
62a78c6

import gradio as gr
import librosa
from transformers import AutoFeatureExtractor, AutoTokenizer, SpeechEncoderDecoderModel
    
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15", use_fast=False)
model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")

feature_extractor = tokenizer = model = None

def process_audio_file(file):
    data, sr = librosa.load(file)
    if sr != 16000:
        data = librosa.resample(data, sr, 16000)
    print(data.shape)
    input_values = feature_extractor(data, return_tensors="pt").input_values
    return input_values
    
def transcribe(file_mic, file_upload, target_language):
    
    target_code = target_language.split("(")[-1].split(")")[0]
    forced_bos_token_id = MAPPING[target_code]
    
    warn_output = ""
    if (file_mic is not None) and (file_upload is not None):
       warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
       file = file_mic
    elif (file_mic is None) and (file_upload is None):
       return "ERROR: You have to either use the microphone or upload an audio file"
    elif file_mic is not None:
       file = file_mic
    else:
       file = file_upload
       
    input_values = process_audio_file(file)
    
    sequences = model.generate(input_values, forced_bos_token_id=forced_bos_token_id)
    
    transcription = tokenizer.batch_decode(sequences, skip_special_tokens=True)
    return warn_output + transcription[0]
    
target_language = [
    "German (de)",
    "Turkish (tr)",
    "Persian (fa)",
    "Swedish (sv)",
    "Mongolian (mn)",
    "Chinese (zh)",
    "Welsh (cy)",
    "Catalan (ca)",
    "Slovenian (sl)",
    "Estonian (et)",
    "Indonesian (id)",
    "Arabic (ar)",
    "Tamil (ta)",
    "Latvian (lv)",
    "Japanese (ja)",
]

MAPPING = {
    "de": 250003,
    "tr": 250023,
    "fa": 250029,
    "sv": 250042,
    "mn": 250037,
    "zh": 250025,
    "cy": 250007,
    "ca": 250005,
    "sl": 250052,
    "et": 250006,
    "id": 250032,
    "ar": 250001,
    "ta": 250044,
    "lv": 250017,
    "ja": 250012,
}
    
iface = gr.Interface(
    fn=transcribe, 
    inputs=[
        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
        gr.inputs.Audio(source="upload", type='filpath', optional=True),
        gr.inputs.Dropdown(target_language),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    article = "<p style='text-align: center'><a href='https://huggingface.co/facebook/wav2vec2-xls-r-300m-en-to-15' target='_blank'>Click to learn more about XLS-R-300M-EN-15 </a> | <a href='https://arxiv.org/abs/2111.09296' target='_blank'> With 🎙️ from Facebook XLS-R </a></p>",
    title="XLS-R 300M EN-to-15 Speech Translation",
    description="A simple interface to translate English Speech to 15 possible languages.",
    enable_queue=True,
    allow_flagging=False,
)
iface.launch()