File size: 3,540 Bytes
5d24988
 
 
 
 
218c550
5d24988
 
 
 
 
 
218c550
5d24988
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218c550
 
 
 
5d24988
 
218c550
3406513
 
 
 
 
 
 
 
5d24988
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8c1cd0
3406513
 
5d24988
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import unicodedata

import gradio as gr
import torch
from datasets import load_dataset
from transformers import pipeline, SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech

# Checkpoint that provides both the fine-tuned TTS model and its processor.
model_id = "Sandiago21/speecht5_finetuned_google_fleurs_greek"  # update with your model id
_VOCODER_ID = "microsoft/speecht5_hifigan"
_XVECTOR_DATASET_ID = "Matthijs/cmu-arctic-xvectors"
# Row of the x-vector dataset used as the (single, fixed) speaker voice.
_SPEAKER_INDEX = 7440

processor = SpeechT5Processor.from_pretrained(model_id)
model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_ID)

embeddings_dataset = load_dataset(_XVECTOR_DATASET_ID, split="validation")
_speaker_xvector = embeddings_dataset[_SPEAKER_INDEX]["xvector"]
# unsqueeze(0) adds a leading dimension of size 1 in front of the x-vector
# (presumably the batch dimension generate_speech expects -- TODO confirm).
speaker_embeddings = torch.tensor(_speaker_xvector).unsqueeze(0)

# Ordered (source, target) substitution rules that rewrite Greek text into
# Latin characters. cleanup_text() applies them one after another with
# str.replace, so ORDER MATTERS: an earlier rule consumes characters before a
# later rule can see them.
# NOTE(review): presumably this mirrors the normalisation used when the
# checkpoint was fine-tuned; verify before changing any mapping.
replacements = [
    # Vowel digraphs (plain and accented) -- listed before the single letters
    # so the pair is replaced as a unit.
    ("ου", "u"),
    ("αυ", "af"),
    ("ευ", "ef"),
    ("ει", "i"),
    ("οι", "i"),
    ("αι", "e"),
    ("ού", "u"),
    ("εί", "i"),
    ("οί", "i"),
    ("αί", "e"),
    # Accented capital vowels.
    ("Ά", "A"),
    ("Έ", "E"),
    ("Ή", "H"),
    ("Ί", "I"),
    ("Ό", "O"),
    ("Ύ", "Y"),
    ("Ώ", "O"),
    ("ΐ", "i"),
    # Capital letters.
    ("Α", "A"),
    ("Β", "B"),
    ("Γ", "G"),
    # NOTE(review): capital delta maps to "L" while lowercase delta maps to
    # "d" below -- looks like a typo for "D"; confirm against training data.
    ("Δ", "L"),
    # NOTE(review): if both sides here are the Greek capital epsilon this rule
    # is a no-op and the ("Ε", "E") rule near the end does the work -- confirm.
    ("Ε", "Ε"),
    ("Ζ", "Z"),
    ("Η", "I"),
    ("Θ", "Th"),
    ("Ι", "I"),
    ("Κ", "K"),
    ("Λ", "L"),
    ("Μ", "M"),
    ("Ν", "N"),
    ("Ξ", "Ks"),
    ("Ο", "O"),
    ("Π", "P"),
    ("Ρ", "R"),
    ("Σ", "S"),
    ("Τ", "T"),
    ("Υ", "Y"),
    ("Φ", "F"),
    ("Χ", "X"),
    ("Ω", "O"),
    # Lowercase letters, accented and plain.
    ("ά", "a"),
    ("έ", "e"),
    ("ή", "i"),
    ("ί", "i"),
    ("α", "a"),
    ("β", "v"),
    ("γ", "g"),
    ("δ", "d"),
    ("ε", "e"),
    ("ζ", "z"),
    ("η", "i"),
    ("θ", "th"),
    ("ι", "i"),
    ("κ", "k"),
    ("λ", "l"),
    ("μ", "m"),
    ("ν", "n"),
    ("ξ", "ks"),
    ("ο", "o"),
    ("π", "p"),
    ("ρ", "r"),
    ("ς", "s"),
    ("σ", "s"),
    ("τ", "t"),
    ("υ", "i"),
    ("φ", "f"),
    ("χ", "h"),
    ("ψ", "ps"),
    ("ω", "o"),
    ("ϊ", "i"),
    ("ϋ", "i"),
    ("ό", "o"),
    ("ύ", "i"),
    ("ώ", "o"),
    # Accented Latin letters folded to their base letter.
    ("í", "i"),
    ("õ", "o"),
    # Remaining capitals appended after the main table.
    ("Ε", "E"),
    ("Ψ", "Ps"),
]


# Page title and Markdown description handed to gr.Interface below.
title = "Text-to-Speech"
# The image alt text previously contained a stray double quote
# (`![Text-to-Speech (TTS)"]`); it has been removed so the Markdown is well formed.
description = """
Demo for text-to-speech translation in Greek. Demo uses [Sandiago21/speecht5_finetuned_google_fleurs_greek](https://huggingface.co/Sandiago21/speecht5_finetuned_google_fleurs_greek) checkpoint, which is based on Microsoft's
[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model and is fine-tuned in Greek Audio dataset
![Text-to-Speech (TTS)](https://geekflare.com/wp-content/uploads/2021/07/texttospeech-1200x385.png "Diagram of Text-to-Speech (TTS)")
"""


def cleanup_text(text, *, table=None):
    """Rewrite *text* by applying an ordered list of substring substitutions.

    Args:
        text: The string to clean up.
        table: Optional iterable of ``(source, target)`` pairs. When omitted,
            the module-level ``replacements`` list is used. Rules are applied
            in order, so earlier rules take precedence over later ones.

    Returns:
        The text after NFC normalisation and all substitutions.
    """
    if table is None:
        table = replacements

    # Compose decomposed sequences (base letter + combining accent) into single
    # code points first; otherwise precomposed accented keys in the table never
    # match and the combining mark is left behind in the output.
    text = unicodedata.normalize("NFC", text)

    for src, dst in table:
        text = text.replace(src, dst)
    return text

def synthesize_speech(text):
    """Synthesize speech for *text* with the loaded SpeechT5 model.

    The text is passed through ``cleanup_text``, tokenised with the module-level
    ``processor``, and turned into a waveform by ``model.generate_speech`` using
    the fixed ``speaker_embeddings`` and ``vocoder``.

    Args:
        text: Text entered by the user.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``samples`` is a NumPy array,
        suitable as the value of a ``gr.Audio`` output; ``None`` when *text* is
        empty or only whitespace.
    """
    # Nothing to say: leave the audio output empty instead of running the model.
    if not text or not text.strip():
        return None

    text = cleanup_text(text)
    inputs = processor(text=text, return_tensors="pt")

    # Clip over-long prompts to the number of text positions the model's config
    # declares, rather than letting generation fail on them.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    speech = model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)

    # Return the plain (rate, array) pair: gr.Audio accepts it directly, and
    # gr.Audio.update() no longer exists in Gradio 4+. 16000 is the sample rate
    # in Hz reported to Gradio for playback.
    return 16000, speech.cpu().numpy()

# Example prompt offered in the UI.
_EXAMPLE_PROMPTS = [
    "Έλαβαν χώρα μεγάλες διαδηλώσεις στην Πολωνία όταν εκείνη η χώρα υπέγραψε την acta που οδήγησε την κυβέρνηση της πολωνίας να αποφασίσει τη μη επικύρωση της συμφωνίας προς το παρόν",
]

# One text box in, one audio player out, backed by synthesize_speech.
_demo = gr.Interface(
    fn=synthesize_speech,
    inputs=gr.Textbox(label="Text", placeholder="Type something here..."),
    outputs=gr.Audio(),
    examples=_EXAMPLE_PROMPTS,
    title=title,
    description=description,
)

# Launch the app at import time; the name keeps whatever launch() returns,
# exactly as the original chained call did.
syntesize_speech_gradio = _demo.launch()