Lightmourne committed
Commit
7cccc79
1 parent: 0c34011

Delete app.py

Files changed (1)
  1. app.py +0 -67
app.py DELETED
@@ -1,67 +0,0 @@
- import gradio as gr
- import numpy as np
- import torch
-
- from transformers import pipeline, VitsModel, VitsTokenizer
-
-
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
- # load the Whisper checkpoint used for speech translation
- asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
-
- # load the MMS text-to-speech checkpoint (French) and move it to the same device
- model = VitsModel.from_pretrained("Matthijs/mms-tts-fra").to(device)
- tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-fra")
-
-
- def translate(audio):
-     # task="transcribe" with language="french" forces French output, i.e. X -> French speech translation
-     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "french"})
-     return outputs["text"]
-
-
- def synthesise(text):
-     inputs = tokenizer(text=text, return_tensors="pt")
-     speech_output = model(inputs["input_ids"].to(device))
-     speech = speech_output.audio[0]
-     return speech.cpu()
-
-
- def speech_to_speech_translation(audio):
-     translated_text = translate(audio)
-     synthesised_speech = synthesise(translated_text)
-     synthesised_speech = (synthesised_speech.detach().numpy() * 32767).astype(np.int16)  # float -> 16-bit PCM
-     return 16000, synthesised_speech  # MMS TTS generates 16 kHz audio
-
-
- title = "Cascaded STST"
- description = """
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in French. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Meta's
- [MMS TTS](https://huggingface.co/Matthijs/mms-tts-fra) model for text-to-speech:
- ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
- """
-
- demo = gr.Blocks()
-
- mic_translate = gr.Interface(
-     fn=speech_to_speech_translation,
-     inputs=gr.Audio(source="microphone", type="filepath"),
-     outputs=gr.Audio(label="Generated Speech", type="numpy"),
-     title=title,
-     description=description,
- )
-
- file_translate = gr.Interface(
-     fn=speech_to_speech_translation,
-     inputs=gr.Audio(source="upload", type="filepath"),
-     outputs=gr.Audio(label="Generated Speech", type="numpy"),
-     examples=[["./example.wav"]],
-     title=title,
-     description=description,
- )
-
- with demo:
-     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
-
- demo.launch()
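
For reference, `speech_to_speech_translation` returns the `(sampling_rate, int16 numpy array)` tuple that Gradio's numpy audio output expects, so the cascade can also be run without the UI. A minimal sketch, assuming the definitions from the deleted app.py above and a hypothetical local input file:

```python
# Hypothetical usage sketch: run the cascade on a local file and save the result,
# reusing speech_to_speech_translation from the app.py shown above.
from scipy.io import wavfile

rate, audio = speech_to_speech_translation("example.wav")  # hypothetical input path
wavfile.write("translated.wav", rate, audio)  # writes 16 kHz, 16-bit PCM
```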