JackismyShephard committed on
Commit
90c5b6d
1 Parent(s): 1d82989

add basic application file

Browse files
Files changed (1) hide show
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+
5
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
6
+
7
# Hugging Face Hub identifiers: the stock SpeechT5 TTS checkpoint and the
# fine-tuned checkpoint this demo serves.
checkpoint_base, checkpoint_finetuned = (
    "microsoft/speecht5_tts",
    "JackismyShephard/speecht5_tts-finetuned-nst-da",
)

# The processor is loaded from the base checkpoint, the acoustic model from
# the fine-tuned one.
processor = SpeechT5Processor.from_pretrained(checkpoint_base)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint_finetuned)

# Vocoder handed to ``model.generate_speech`` when synthesizing audio.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
12
+
13
# Maps a three-character speaker code (the prefix of each label offered in
# the UI) to the ``.npy`` file holding that speaker's embedding.
# NOTE(review): the M51 path spells "sudsjaelland" while M18 spells
# "sydsjaelland" -- confirm both match the files actually on disk.
speaker_embeddings = dict(
    F23="embeddings/female_23_vestjylland.npy",
    F24="embeddings/female_24_storkoebenhavn.npy",
    F49="embeddings/female_49_nordjylland.npy",
    M51="embeddings/male_51_vest_sudsjaelland.npy",
    M18="embeddings/male_18_vest_sydsjaelland.npy",
    M31="embeddings/male_31_fyn.npy",
)
21
+
22
+
23
def predict(text, speaker):
    """Synthesize speech for ``text`` using the selected speaker embedding.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only text
            yields an empty waveform without touching the model.
        speaker: Speaker label; its first three characters must be a key of
            ``speaker_embeddings`` (e.g. ``"F23 (Female, 23, Vestjylland)"``).

    Returns:
        A ``(sample_rate, waveform)`` tuple where ``sample_rate`` is 16000
        and ``waveform`` is a one-dimensional NumPy array.

    Raises:
        KeyError: If the speaker code is not present in
            ``speaker_embeddings``.
    """
    sample_rate = 16000

    if not text or not text.strip():
        # Return an *integer* empty array: float audio is peak-normalised by
        # Gradio before encoding, and taking the max of an empty array
        # raises ValueError. int16 data is passed through untouched.
        return (sample_rate, np.zeros(0, dtype=np.int16))

    text = replace_danish_letters(text)
    inputs = processor(text=text, return_tensors="pt")

    # Truncate over-long input to the number of text positions the model
    # is configured for.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    # The label's three-character prefix selects the embedding file; the
    # loaded array gets a leading batch dimension before it reaches the model.
    embedding_path = speaker_embeddings[speaker[:3]]
    speaker_embedding = torch.tensor(np.load(embedding_path)).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
    return (sample_rate, speech.numpy())
43
+
44
+
45
def replace_danish_letters(text):
    """Return ``text`` with each ``replacements`` pair applied in list order.

    Every occurrence of a pair's first element is replaced by its second
    element, one pair after another.
    """
    cleaned = text
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned
49
+
50
+
51
# Ordered (source, target) substitutions applied to the input text by
# ``replace_danish_letters`` before tokenization.
# NOTE(review): presumably these are the characters the tokenizer cannot
# represent -- confirm against the tokenizer vocabulary.
replacements = [
    # Symbols and control characters.
    ("&", "og"),
    ("\r", " "),
    ("´", ""),
    ("\\", ""),
    ("¨", " "),
    # Upper-case letters.
    ("Å", "AA"),
    ("Æ", "AE"),
    ("É", "E"),
    ("Ö", "OE"),
    ("Ø", "OE"),
    # Lower-case letters.
    ("á", "a"),
    ("ä", "ae"),
    ("å", "aa"),
    ("è", "e"),
    ("î", "i"),
    ("ô", "oe"),
    ("ö", "oe"),
    ("ø", "oe"),
    ("ü", "y"),
]
72
+
73
# Heading shown at the top of the demo page.
title = "Danish Speech Synthesis"

# Markdown blurb shown under the title. This must be an f-string so that the
# checkpoint name and its Hub URL are interpolated into the link; as a plain
# string the braces (and stray quote characters) were rendered literally.
description = f"""
synthesize long-form danish speech from text with the click of a button! Demo uses the
checkpoint [{checkpoint_finetuned}](https://huggingface.co/{checkpoint_finetuned}) and 🤗 Transformers to synthesize speech.
"""
79
+
80
# Example rows for the interface. Each row lists one value per input
# component, in order: [input text, speaker label]. The speaker is given
# explicitly because example caching calls the prediction function with the
# row's values, and that function requires both arguments.
examples = [
    [
        "I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet som en personificering af det land og den natur, der omgav menneskene, dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. De var guddommelige eller delvis guddommelige væsener, der besad magiske kræfter, som de brugte både til fordel og ulempe for menneskene.",
        "F23 (Female, 23, Vestjylland)",
    ],
]
85
+
86
# Gradio UI: a text box plus a speaker selector feed ``predict``; the result
# is played back through an audio component.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Radio(
            label="Speaker",
            # Only the three-character prefix of each label is used to look
            # up the embedding; the rest is descriptive text for the user
            # and follows one "(Sex, age, region)" format.
            choices=[
                "F23 (Female, 23, Vestjylland)",
                "F24 (Female, 24, Storkoebenhavn)",
                "F49 (Female, 49, Nordjylland)",
                "M51 (Male, 51, Vest-sydsjaelland)",
                "M18 (Male, 18, Vest-sydsjaelland)",
                "M31 (Male, 31, Fyn)",
            ],
            value="F23 (Female, 23, Vestjylland)",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    examples=examples,
    cache_examples=True,
    allow_flagging="never",
)

demo.launch()