speech-to-speech-translation

Sleeping

App Files Files Community

speech-to-speech-translation / app.py

Annvasileva

Update app.py

bfddd1a 9 months ago

raw

history blame contribute delete

No virus

2.76 kB

	import gradio as gr
	import torch
	from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor, AutoProcessor, AutoModel


	# Run inference on the GPU when torch can see one, otherwise on the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Speech-recognition model and its processor, both loaded from the
	# "facebook/wav2vec2-base-960h" checkpoint; only the model is moved to `device`.
	asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
	asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
	asr_model = asr_model.to(device)

	def transcribe(audio):
	    """Transcribe speech to text with the wav2vec2 CTC model.

	    Args:
	        audio: Either a path to an audio file (the Gradio inputs in this app
	            use ``type="filepath"``) or an already-decoded waveform sampled
	            at 16 kHz, which is handed to the processor unchanged.

	    Returns:
	        The decoded transcription of the first item in the batch.
	    """
	    import os

	    sampling_rate = 16000

	    # The processor expects raw samples, not a file name, so a path has to be
	    # decoded and resampled to the rate the processor is told about below.
	    if isinstance(audio, (str, os.PathLike)):
	        # Imported lazily so the module-level import list stays untouched.
	        # ffmpeg_read needs the `ffmpeg` executable to be available.
	        from transformers.pipelines.audio_utils import ffmpeg_read

	        with open(audio, "rb") as audio_file:
	            audio = ffmpeg_read(audio_file.read(), sampling_rate)

	    inputs = asr_processor(
	        audio,
	        return_tensors="pt",
	        padding=True,
	        sampling_rate=sampling_rate,
	    )
	    input_values = inputs.input_values.to(device)

	    # Inference only: skip building the autograd graph.
	    with torch.no_grad():
	        logits = asr_model(input_values).logits

	    # Greedy CTC decoding: most likely token per frame, collapsed by the processor.
	    predicted_ids = torch.argmax(logits, dim=-1)
	    transcription = asr_processor.batch_decode(predicted_ids)
	    return transcription[0]

	# English -> Russian translation pipeline.
	translator_model = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")

	# Russian speech synthesis with Bark.
	# `tts_model` must be the generative model (it is the object `.generate()` is
	# called on) and `tts_tokenizer` the processor that turns text into model
	# inputs; the two loaders were previously assigned the other way round.
	tts_model = AutoModel.from_pretrained("suno/bark")
	tts_tokenizer = AutoProcessor.from_pretrained("suno/bark")

	def translate_to_russian(text):
	    """Translate text to Russian with the module-level ``translator_model``.

	    Args:
	        text: The source text to translate.

	    Returns:
	        The translated text, or an empty string when ``text`` is empty or
	        contains only whitespace.
	    """
	    # Nothing to translate; avoid running the model on blank input.
	    if not text or not text.strip():
	        return ""

	    # `translator_model` is a transformers pipeline: it is called directly and
	    # returns one dict per input holding the result under "translation_text".
	    results = translator_model(text)
	    return results[0]["translation_text"]

	def synthesize_russian(text):
	    """Translate ``text`` to Russian and generate speech for the translation.

	    Args:
	        text: Source text; it is passed through ``translate_to_russian`` first.

	    Returns:
	        The output of ``tts_model.generate`` moved to the CPU.
	    """
	    russian_text = translate_to_russian(text)
	    # NOTE(review): this relies on `tts_tokenizer` producing model inputs and
	    # on `tts_model` exposing `.generate()` - confirm against the module-level
	    # loading code.
	    model_inputs = tts_tokenizer(russian_text, return_tensors="pt")
	    generated = tts_model.generate(**model_inputs)
	    return generated.to("cpu")

	def speech_to_speech_translation(audio):
	    """Run the full pipeline: transcribe, translate to Russian, synthesize.

	    Args:
	        audio: The value delivered by the Gradio audio input (a file path in
	            this app), or ``None`` when nothing was recorded or uploaded.

	    Returns:
	        A ``(sample_rate, waveform)`` tuple, the form a Gradio
	        ``Audio(type="numpy")`` output expects.

	    Raises:
	        gr.Error: If no audio was provided.
	    """
	    if audio is None:
	        # Surface a readable message in the UI instead of an opaque traceback.
	        raise gr.Error("No audio was provided. Please record or upload a clip.")

	    # Bark generates audio at 24 kHz.
	    bark_sample_rate = 24000

	    transcribed_text = transcribe(audio)
	    russian_speech = synthesize_russian(transcribed_text)

	    # Drop the leading batch dimension so Gradio receives a 1-D mono signal.
	    waveform = russian_speech.numpy().squeeze()
	    return bark_sample_rate, waveform

	# Heading and explanatory text shown on both Gradio interfaces.
	title = "Speech-to-Speech Translation"
	description = (
	    "Код сначала использует модель facebook/wav2vec2-base-960h для распознавания речи на английском."
	    "Затем переводит текст на русский с помощью модели Helsinki-NLP/opus-mt-en-ru, "
	    "и осуществляет синтез речи на русском на основе модели suno/bark"
	)



	def _audio_interface(audio_source):
	    """Build one speech-translation interface fed by the given audio source."""
	    # NOTE(review): `source=` is the Gradio 3.x spelling; newer releases use
	    # `sources=[...]` - confirm against the gradio version this app pins.
	    return gr.Interface(
	        fn=speech_to_speech_translation,
	        inputs=gr.Audio(source=audio_source, type="filepath"),
	        outputs=gr.Audio(label="Generated Speech", type="numpy"),
	        title=title,
	        description=description,
	    )


	demo = gr.Blocks()

	# Same pipeline behind two inputs: live microphone capture and file upload.
	mic_translate = _audio_interface("microphone")
	file_translate = _audio_interface("upload")

	# Show the two interfaces as tabs inside a single Blocks app.
	with demo:
	    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "File"])

	demo.launch()