Oysiyl committed on
Commit
0d2eaf0
1 Parent(s): f521c52

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -0
app.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
from typing import Text, Tuple

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from speechbrain.pretrained import EncoderClassifier
from transformers import pipeline
10
+
11
+
12
def create_speaker_embedding(speaker_model, waveform: np.ndarray) -> torch.Tensor:
    """Encode a reference waveform into a normalised speaker-embedding tensor.

    Args:
        speaker_model: Object exposing ``encode_batch(tensor)``; in this file
            it is the SpeechBrain ``EncoderClassifier`` built at module level.
        waveform: Audio samples of the reference speaker.

    Returns:
        The embedding with singleton dimensions squeezed out and a leading
        batch dimension re-added, cast to the module-level ``dtype`` and moved
        to the module-level ``device``.
    """
    # Inference only: no autograd graph is needed for the embedding.
    with torch.no_grad():
        embedding = speaker_model.encode_batch(torch.tensor(waveform))
        # L2-normalise along dim 2 (the last axis of encode_batch's output).
        embedding = torch.nn.functional.normalize(embedding, dim=2)
        # A single ``.to`` handles both the dtype cast and the device move, so
        # no CPU/CUDA branch or NumPy round-trip is required.
        embedding = embedding.squeeze().to(device=device, dtype=dtype)
    return embedding.unsqueeze(0)
22
+
23
+
24
# Characters deleted outright: ellipsis, dash, straight and curly quotes,
# percent sign, U+FFFD replacement character, guillemets, low-9 quote,
# backtick, apostrophe and the combining acute accent (U+0301).
_CHARS_TO_DROP = "…–\"“%‘”\ufffd»«„`'\u0301"

# Single-pass translation table: drop the characters above, fold the
# apostrophe look-alikes "՚" and "’" into "'", and map "ы" to "и".
_CLEANUP_TABLE = str.maketrans(
    {
        **dict.fromkeys(_CHARS_TO_DROP),  # a value of None deletes the character
        "՚": "'",
        "’": "'",
        "ы": "и",
    }
)


def remove_special_characters_s(text: Text) -> Text:
    """Lowercase *text* and strip or normalise special characters.

    The text is lowercased first so that an uppercase "Ы" is also mapped to
    "и". Because the table is applied in one pass, apostrophes produced from
    "՚" or "’" are kept, while apostrophes already present in the input are
    removed.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, lowercased text.
    """
    return text.lower().translate(_CLEANUP_TABLE)
33
+
34
+
35
# Lowercase Cyrillic letter -> Latin replacement, applied in a single pass.
_CYRILLIC_TO_LATIN = str.maketrans(
    {
        'а': 'a',
        'б': 'b',
        'в': 'v',
        'г': 'h',
        'д': 'd',
        'е': 'e',
        'ж': 'zh',
        'з': 'z',
        'и': 'y',
        'й': 'j',
        'к': 'k',
        'л': 'l',
        'м': 'm',
        'н': 'n',
        'о': 'o',
        'п': 'p',
        'р': 'r',
        'с': 's',
        'т': 't',
        'у': 'u',
        'ф': 'f',
        'х': 'h',
        'ц': 'ts',
        'ч': 'ch',
        'ш': 'sh',
        'щ': 'sch',
        'ь': "'",
        'ю': 'ju',
        'я': 'ja',
        'є': 'je',
        'і': 'i',
        'ї': 'ji',
        'ґ': 'g',
    }
)


def cyrillic_to_latin(text: Text) -> Text:
    """Transliterate lowercase Cyrillic letters in *text* to Latin.

    Characters without an entry in the table (uppercase letters, digits,
    punctuation, Latin text) are passed through unchanged.
    """
    return text.translate(_CYRILLIC_TO_LATIN)
75
+
76
+
77
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# torch.cuda.get_device_capability() raises when no CUDA device is available,
# so it is only queried on the CUDA path. On CPU the embeddings stay in
# float32, PyTorch's default floating-point dtype.
if device.type != 'cuda':
    dtype = torch.float32
elif torch.cuda.get_device_capability()[0] == 8:
    # Compute capability 8.x: use bfloat16.
    dtype = torch.bfloat16
else:
    dtype = torch.float16

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Speaker encoder; its files are stored under /tmp/<model name>.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)

# Reference recording; its embedding is computed once at import time and
# passed to the TTS pipeline on every request.
waveform, samplerate = sf.read("speaker.wav")

speaker_embeddings = create_speaker_embedding(speaker_model, waveform)

transcriber = pipeline("text-to-speech", model="Oysiyl/speecht5_tts_common_voice_uk")
94
+
95
def transcribe(text: Text) -> Tuple[int, np.ndarray]:
    """Synthesise speech for *text* with the module-level TTS pipeline.

    The text is first cleaned with ``remove_special_characters_s`` and then
    transliterated with ``cyrillic_to_latin`` before being passed to
    ``transcriber`` together with the precomputed ``speaker_embeddings``.

    Args:
        text: Input text typed by the user.

    Returns:
        A ``(sampling_rate, audio)`` pair taken from the pipeline output, in
        the order the Gradio "audio" output is given here.
    """
    prepared = cyrillic_to_latin(remove_special_characters_s(text))
    out = transcriber(prepared, forward_params={"speaker_embeddings": speaker_embeddings})
    return out["sampling_rate"], out["audio"]
101
+
102
+
103
# Example prompts shown under the text box; each inner list is one example row.
_EXAMPLE_PROMPTS = [
    ["Держава-агресор Росія закуповує комунікаційне обладнання, зокрема супутникові інтернет-термінали Starlink, для використання у війні в арабських країнах"],
    ["Доброго вечора, ми з України!"],
]

# Text-in / audio-out interface around `transcribe`.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Textbox(),
    outputs="audio",
    title="Text to Speech for Ukrainian language demo",
    description="Click on the example below or type text!",
    examples=_EXAMPLE_PROMPTS,
    cache_examples=True,
)

demo.launch()