Commit · d197937
Parent(s): c6ab084
allowing model to synthesize samples using the CPU
Files changed:
- .DS_Store +0 -0
- app.py +16 -3
- output/.DS_Store +0 -0
- output/ckpt/.DS_Store +0 -0
- utils/.DS_Store +0 -0
- utils/model.py +5 -5
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
app.py CHANGED
@@ -1,7 +1,20 @@
 import gradio as gr
+import subprocess
 
-
-    return "Hello " + name + "!!"
+predefined_texts = ["Example text 1", "Example text 2", "Example text 3"]
 
-
+
+def synthesize_speech(text, speaker_id):
+    command = f"python3 synthesize.py --text '{text}' --bert_embed 1 --speaker_id {speaker_id} --restore_step 900000 --mode single -p config/EmoV_DB/preprocess.yaml -m config/EmoV_DB/model.yaml -t config/EmoV_DB/train.yaml"
+    output = subprocess.check_output(command, shell=True)
+    # Replace this with the path of the generated audio file
+    audio_file = 'output_file_path'
+    return audio_file
+
+
+iface = gr.Interface(fn=synthesize_speech,
+                     inputs=[gr.inputs.Dropdown(choices=predefined_texts, label="Select a text"),
+                             gr.inputs.Slider(minimum=0, maximum=10, step=1, default=0, label="Speaker ID")],
+                     outputs=gr.outputs.Audio(type="file"),
+                     title="Text-to-Speech Demo")
 iface.launch()
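A side note on the new wrapper: interpolating `text` into a shell string and running it with shell=True means a quote or other shell metacharacter in the selected text can break the command. A minimal sketch of the same call built as an argument list, assuming the exact synthesize.py flags used in the commit, with the hard-coded return path kept as the placeholder it is in app.py:

import subprocess

def synthesize_speech(text, speaker_id):
    # Same synthesize.py flags as the f-string version above, but passed as an
    # argv list so quotes or spaces inside `text` cannot escape the command.
    command = [
        "python3", "synthesize.py",
        "--text", text,
        "--bert_embed", "1",
        "--speaker_id", str(speaker_id),
        "--restore_step", "900000",
        "--mode", "single",
        "-p", "config/EmoV_DB/preprocess.yaml",
        "-m", "config/EmoV_DB/model.yaml",
        "-t", "config/EmoV_DB/train.yaml",
    ]
    subprocess.run(command, check=True)  # raise if synthesize.py exits non-zero
    # Placeholder, as in the commit: the real path of the generated audio
    # depends on how synthesize.py names its output.
    audio_file = "output_file_path"
    return audio_file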
output/.DS_Store CHANGED
Binary files a/output/.DS_Store and b/output/.DS_Store differ
output/ckpt/.DS_Store CHANGED
Binary files a/output/ckpt/.DS_Store and b/output/ckpt/.DS_Store differ
utils/.DS_Store CHANGED
Binary files a/utils/.DS_Store and b/utils/.DS_Store differ
utils/model.py CHANGED
@@ -17,8 +17,8 @@ def get_model(args, configs, device, train=False):
             train_config["path"]["ckpt_path"],
             "{}.pth.tar".format(args.restore_step),
         )
-        ckpt = torch.load(ckpt_path)
-        model.load_state_dict(ckpt["model"]
+        ckpt = torch.load(ckpt_path, map_location=device)
+        model.load_state_dict(ckpt["model"])
 
     if train:
         scheduled_optim = ScheduledOptim(
@@ -50,7 +50,7 @@ def get_vocoder(config, device):
             )
         elif speaker == "universal":
             vocoder = torch.hub.load(
-                "descriptinc/melgan-neurips", "load_melgan", "multi_speaker"
+                "descriptinc/melgan-neurips", "load_melgan", "multi_speaker",map_location=device
             )
         vocoder.mel2wav.eval()
         vocoder.mel2wav.to(device)
@@ -60,9 +60,9 @@ def get_vocoder(config, device):
         config = hifigan.AttrDict(config)
         vocoder = hifigan.Generator(config)
         if speaker == "LJSpeech":
-            ckpt = torch.load("hifigan/generator_LJSpeech.pth.tar")
+            ckpt = torch.load("hifigan/generator_LJSpeech.pth.tar",map_location=device)
         elif speaker == "universal":
-            ckpt = torch.load("hifigan/generator_universal.pth.tar")
+            ckpt = torch.load("hifigan/generator_universal.pth.tar",map_location=device)
         vocoder.load_state_dict(ckpt["generator"])
         vocoder.eval()
         vocoder.remove_weight_norm()
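The map_location argument is what makes the CPU path work: without it, torch.load tries to put the saved tensors back on the CUDA device they were serialized from, which fails on a machine with no GPU. A minimal, self-contained sketch of the pattern (the toy nn.Linear model and the checkpoint name are stand-ins, not part of this repo):

import torch
import torch.nn as nn

# Pick CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Stand-in for FastSpeech2 / the HiFi-GAN generator.
model = nn.Linear(4, 2)
torch.save({"model": model.state_dict()}, "checkpoint.pth.tar")

# map_location remaps every stored tensor onto `device` while the checkpoint
# is deserialized, so a file written on a GPU machine still loads here.
ckpt = torch.load("checkpoint.pth.tar", map_location=device)
model.load_state_dict(ckpt["model"])
model.to(device).eval()

The MelGAN change is different in kind: torch.hub.load forwards extra keyword arguments to the hub entrypoint, so whether map_location=device is accepted there depends on how load_melgan is defined in that repo's hubconf rather than on torch.load itself.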