Spaces:
Running
Running
anderbogia
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -6,16 +6,15 @@ os.system("pip install transformers==4.30.2") #Some interoperability issue with
|
|
6 |
os.system("pip install tokenizers fairseq")
|
7 |
os.system("pip install numpy==1.24") #NumPy 1.24 or less needed by Numba. Use 1.23, librosa still uses np.complex which was dropped in NumPy 1.24
|
8 |
#os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
|
9 |
-
os.system("pip install torch accelerate torchaudio datasets
|
10 |
os.system("pip install librosa==0.9.0")
|
11 |
|
12 |
|
13 |
import gradio as gr
|
14 |
-
from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor
|
15 |
from datasets import load_dataset, Audio, Dataset
|
16 |
import torch
|
17 |
import librosa #For converting audio sample rate to 16k
|
18 |
-
from easymms.models.tts import TTSModel #For TTS inference using EasyMMS
|
19 |
|
20 |
LANG = "dtp" #Change to tih for Timugon Murut or iba for Iban
|
21 |
model_id = "facebook/mms-1b-all"
|
@@ -27,6 +26,9 @@ model.load_adapter(LANG)
|
|
27 |
|
28 |
asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Function that returns a dict, transcription stored in item with key "text"
|
29 |
|
|
|
|
|
|
|
30 |
def preprocess(input): #Sets recording sampling rate to 16k and returns numpy ndarray from audio
|
31 |
speech, sample_rate = librosa.load(input)
|
32 |
speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
|
@@ -61,13 +63,15 @@ with gr.Blocks(theme = gr.themes.Soft()) as demo:
|
|
61 |
</div>
|
62 |
""")
|
63 |
|
64 |
-
tts = TTSModel(LANG)
|
65 |
|
66 |
-
def
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
|
|
|
|
|
|
71 |
|
72 |
with gr.Row():
|
73 |
with gr.Column(scale = 1):
|
@@ -89,6 +93,6 @@ with gr.Blocks(theme = gr.themes.Soft()) as demo:
|
|
89 |
input_text = gr.components.Textbox(label = "Ginarit", placeholder = "Potutakai suat nu hiti")
|
90 |
button2 = gr.Button("Poulayo'")
|
91 |
output_audio = gr.components.Audio(label = "Rolou pinoulai")
|
92 |
-
button2.click(
|
93 |
|
94 |
demo.launch(debug = True)
|
|
|
6 |
os.system("pip install tokenizers fairseq")
|
7 |
os.system("pip install numpy==1.24") #NumPy 1.24 or less needed by Numba. Use 1.23, librosa still uses np.complex which was dropped in NumPy 1.24
|
8 |
#os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
|
9 |
+
os.system("pip install torch accelerate torchaudio datasets")
|
10 |
os.system("pip install librosa==0.9.0")
|
11 |
|
12 |
|
13 |
import gradio as gr
|
14 |
+
from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
|
15 |
from datasets import load_dataset, Audio, Dataset
|
16 |
import torch
|
17 |
import librosa #For converting audio sample rate to 16k
|
|
|
18 |
|
19 |
LANG = "dtp" #Change to tih for Timugon Murut or iba for Iban
|
20 |
model_id = "facebook/mms-1b-all"
|
|
|
26 |
|
27 |
asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Function that returns a dict, transcription stored in item with key "text"
|
28 |
|
29 |
+
model_tts = VitsModel.from_pretrained("facebook/mms-tts-dtp")
|
30 |
+
tokenizer_tts = AutoTokenizer.from_pretrained("facebook/mms-tts-dtp")
|
31 |
+
|
32 |
def preprocess(input): #Sets recording sampling rate to 16k and returns numpy ndarray from audio
|
33 |
speech, sample_rate = librosa.load(input)
|
34 |
speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
|
|
|
63 |
</div>
|
64 |
""")
|
65 |
|
|
|
66 |
|
67 |
+
def tts_run(input):
    """Synthesize speech for the given text with the MMS-TTS model.

    Args:
        input: Text typed into the Gradio textbox.

    Returns:
        A ``(sample_rate, waveform)`` tuple for the Gradio Audio output,
        or ``None`` when there is no text to synthesize.
    """
    # Nothing to speak: skip the model call instead of feeding it an
    # empty token sequence.
    if not input or not input.strip():
        return None

    # Keep the tokenizer output -- it is what gets unpacked into the model.
    # The TTS objects are the module-level tokenizer_tts / model_tts.
    inputs = tokenizer_tts(input, return_tensors="pt")

    with torch.no_grad():
        output = model_tts(**inputs).waveform

    # Drop the batch dimension and hand Gradio a NumPy array rather than a
    # torch tensor.
    waveform = output.squeeze().cpu().numpy()

    # Use the model's own sampling rate rather than a hard-coded constant,
    # and return a real tuple (the original built a list).
    return (model_tts.config.sampling_rate, waveform)
|
75 |
|
76 |
with gr.Row():
|
77 |
with gr.Column(scale = 1):
|
|
|
93 |
input_text = gr.components.Textbox(label = "Ginarit", placeholder = "Potutakai suat nu hiti")
|
94 |
button2 = gr.Button("Poulayo'")
|
95 |
output_audio = gr.components.Audio(label = "Rolou pinoulai")
|
96 |
+
button2.click(tts_run, inputs = input_text, outputs = output_audio)
|
97 |
|
98 |
demo.launch(debug = True)
|