import gradio as gr import librosa from transformers import AutoFeatureExtractor, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline def load_and_fix_data(input_file, model_sampling_rate): speech, sample_rate = librosa.load(input_file) if len(speech.shape) > 1: speech = speech[:, 0] + speech[:, 1] if sample_rate != model_sampling_rate: speech = librosa.resample(speech, sample_rate, model_sampling_rate) return speech feature_extractor = AutoFeatureExtractor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-spanish") sampling_rate = feature_extractor.sampling_rate asr = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish") prefix = '' model_checkpoint = "hackathon-pln-es/es_text_neutralizer" tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) def postproc(input_sentence, preds): try: preds = preds.replace('De el', 'Del').replace('de el', 'del').replace(' ', ' ') if preds[0].islower(): preds = preds.capitalize() preds = preds.replace(' . ', '. ').replace(' , ', ', ') # Nombres en mayusculas prev_letter = '' for word in input_sentence.split(' '): if word: if word[0].isupper(): if word.lower() in preds and word != input_sentence.split(' ')[0]: if prev_letter == '.': preds = preds.replace('. ' + word.lower() + ' ', '. ' + word + ' ') else: if word[-1] == '.': preds = preds.replace(word.lower(), word) else: preds = preds.replace(word.lower() + ' ', word + ' ') prev_letter = word[-1] preds = preds.strip() # quitar ultimo espacio except: pass return preds model_name = "es/mai/tacotron2-DDC" def predict_and_ctc_lm_decode(input_file, speaker_idx: str=None): speech = load_and_fix_data(input_file, sampling_rate) transcribed_text = asr(speech, chunk_length_s=5, stride_length_s=1) transcribed_text = transcribed_text["text"] inputs = tokenizer([prefix + transcribed_text], return_tensors="pt", padding=True) with torch.no_grad(): if first_generation: output_sequence = model.generate( input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device), do_sample=False, # disable sampling to test if batching affects output ) else: output_sequence = model.generate( input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device), do_sample=False, num_beams=2, repetition_penalty=2.5, # length_penalty=1.0, early_stopping=True# disable sampling to test if batching affects output ) preds = postproc(transcribed_text, preds=tokenizer.decode(output_sequence[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)) if len(preds) > MAX_TXT_LEN: text = preds[:MAX_TXT_LEN] print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.") print(text, model_name) # download model model_path, config_path, model_item = manager.download_model(f"tts_models/{model_name}") vocoder_name: Optional[str] = model_item["default_vocoder"] # download vocoder vocoder_path = None vocoder_config_path = None if vocoder_name is not None: vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name) # init synthesizer synthesizer = Synthesizer( model_path, config_path, None, None, vocoder_path, vocoder_config_path, ) # synthesize if synthesizer is None: raise NameError("model not found") wavs = synthesizer.tts(preds, speaker_idx) # return output with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: synthesizer.save_wav(wavs, fp) return fp.name gr.Interface( predict_and_ctc_lm_decode, inputs=[ gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio") ], outputs=gr.outputs.Audio(label="Output"), examples=[["audio1.wav"], ["travel.wav"]], title="Generate-Gender-Neutralized-Audios", description = "This is a Gradio demo for generating gender neutralized audios. To use it, simply provide an audio input (via microphone or audio recording), which will then be transcribed and gender-neutralized using a pre-trained models. Finally, with the help of Coqui's TTS model, gender neutralised audio is generated.", #article="