Abigail committed on
Commit
5817c5e
1 Parent(s): 1f2d661

first commit tts and stt with multiple stt possibilities

Browse files
Files changed (2) hide show
  1. .DS_Store +0 -0
  2. stttotts.py +177 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
stttotts.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """sttToTts.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/15QqRKFSwfhRdnaj5-R1z6xFfeEOOta38
8
+ """
9
+
10
+ #text-to-speech and speech to text
11
+ !pip install TTS
12
+ !pip install transformers
13
+
14
+ #text to speech
15
+ from TTS.api import TTS
16
+ tts = TTS("tts_models/multilingual/multi-dataset/your_tts", cs_api_model = "TTS.cs_api.CS_API", gpu=True)
17
+
18
+ #voice recording
19
+ import IPython.display
20
+ import google.colab.output
21
+ import base64
22
+ # all imports for voice recording
23
+ from IPython.display import Javascript
24
+ from google.colab import output
25
+ from base64 import b64decode
26
+
27
+ #to record sound, found on https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be
28
+
29
+ RECORD = """
30
+ const sleep = time => new Promise(resolve => setTimeout(resolve, time))
31
+ const b2text = blob => new Promise(resolve => {
32
+ const reader = new FileReader()
33
+ reader.onloadend = e => resolve(e.srcElement.result)
34
+ reader.readAsDataURL(blob)
35
+ })
36
+ var record = time => new Promise(async resolve => {
37
+ stream = await navigator.mediaDevices.getUserMedia({ audio: true })
38
+ recorder = new MediaRecorder(stream)
39
+ chunks = []
40
+ recorder.ondataavailable = e => chunks.push(e.data)
41
+ recorder.start()
42
+ await sleep(time)
43
+ recorder.onstop = async ()=>{
44
+ blob = new Blob(chunks)
45
+ text = await b2text(blob)
46
+ resolve(text)
47
+ }
48
+ recorder.stop()
49
+ })
50
+ """
51
+
52
def record(name, sec):
    """Record microphone audio in the browser and save it as ``<name>.webm``.

    Displays the RECORD JavaScript in the notebook output, evaluates
    ``record(<milliseconds>)`` there, and base64-decodes the part of the
    returned string that follows the first comma (RECORD builds that string
    with ``readAsDataURL``).

    Args:
        name: File name without extension; ``.webm`` is appended.
        sec: Recording duration in seconds.

    Returns:
        The name of the file that was written, ``f'{name}.webm'``.
    """
    display(Javascript(RECORD))
    # The JS helper expects milliseconds.
    data_url = output.eval_js('record(%d)' % (sec*1000))
    encoded_audio = data_url.split(',')[1]
    audio_bytes = b64decode(encoded_audio)
    target = f'{name}.webm'
    with open(target, 'wb') as audio_file:
        audio_file.write(audio_bytes)
    return target
59
+
60
+ #to record the text which is going to be transcribed
61
+ record('audio', sec = 10)
62
+
63
+ #works -- speech-to-text on an audio file whose path I provide
64
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
65
+ import librosa
66
+
67
+ # load model and processor
68
+ processor = WhisperProcessor.from_pretrained("openai/whisper-small")
69
+ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
70
+ model.config.forced_decoder_ids = None
71
+
72
+ # load audio from a specific path
73
+ audio_path = "audio.webm"
74
+ audio_array, sampling_rate = librosa.load(audio_path, sr=16000) # "sr=16000" ensures that the sampling rate is as required
75
+
76
+
77
+ # process the audio array
78
+ input_features = processor(audio_array, sampling_rate, return_tensors="pt").input_features
79
+
80
+
81
+ predicted_ids = model.generate(input_features)
82
+
83
+ transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
84
+ print(transcription)
85
+
86
+ #to record the speaker's voice used for tts
87
+ record('speaker', sec = 10 )
88
+
89
+ #library to convert digits to words (ex : 1 --> one)
90
+ import locale
91
+ locale.getpreferredencoding = lambda: "UTF-8"
92
+ !pip install inflect
93
+
94
+ import re
95
+ import inflect
96
+ #because numbers written as digits are ignored otherwise
97
def convert_numbers_to_words(s, engine=None):
    """Spell out every run of digits in ``s`` as words (e.g. "1" -> "one").

    Args:
        s: Text that may contain digit sequences.
        engine: Optional object exposing ``number_to_words``. When omitted,
            a new ``inflect.engine()`` is created. Passing one lets callers
            reuse the same engine across calls.

    Returns:
        ``s`` with each maximal run of digits replaced by its word form.
        Text without digits is returned unchanged.
    """
    if engine is None:
        engine = inflect.engine()
    # Substitute match by match in a single pass. Collecting the numbers and
    # then calling str.replace() for each one also rewrites digits that sit
    # inside longer numbers ("1" inside "11"), corrupting the result.
    return re.sub(r'\d+', lambda match: engine.number_to_words(match.group()), s)
107
+
108
+ #model test 1 for text to speech
109
+ #works - text to speech with voice cloner (by providing the path to the audio where the voice is)
110
+ from google.colab import drive
111
+ from IPython.display import Audio
112
+
113
+
114
+
115
# `transcription` comes from processor.batch_decode, which yields a list of
# strings. Join the entries instead of calling str() on the list, otherwise
# the brackets and quotes of the list repr are passed to the synthesizer.
spoken_text = convert_numbers_to_words(" ".join(transcription))

# Synthesize the transcription to output.wav, using speaker.webm as the
# reference voice.
audio_path = "output.wav"
tts.tts_to_file(text=spoken_text,
                file_path=audio_path,
                speaker_wav='speaker.webm',
                language="en",
                emotion='angry',
                speed=2)
Audio(audio_path)
123
+
124
+ #model test 2 for text to speech
125
+ from IPython.display import Audio
126
+ # TTS with on the fly voice conversion
127
api = TTS("tts_models/deu/fairseq/vits")

# Use one path for both writing and playback. The original wrote to
# "ouptut.wav" (typo) and then played "output.wav", i.e. a stale file left
# over from an earlier cell.
audio_path = "output.wav"
api.tts_with_vc_to_file(
    text="Wie sage ich auf Italienisch, dass ich dich liebe?",
    speaker_wav="speaker.webm",
    file_path=audio_path,
)
Audio(audio_path)
135
+
136
+ #model test 3 for text to speech
137
+ from TTS.api import TTS
138
+ tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)
139
+
140
+ from IPython.display import Audio
141
+
142
+
143
+ # generate speech by cloning a voice using custom settings
144
+ tts.tts_to_file(text="But for me to rap like a computer it must be in my genes I got a laptop in my back pocket My pen'll go off when I half-cock it Got a fat knot from that rap profit Made a livin' and a killin' off it Ever since Bill Clinton was still in office with Monica Lewinsky feelin' on his nutsack I'm an MC still as honest",
145
+ file_path="output.wav",
146
+ speaker_wav="Slide 1.m4a",
147
+ language="en",
148
+ emotion = "neutral",
149
+ decoder_iterations=35)
150
+
151
+ audio_path = "output.wav"
152
+ Audio(audio_path)
153
+
154
+ # Init TTS with the target studio speaker
155
# OUTPUT_PATH was referenced below but never defined in this file, so the
# cell failed with NameError. Use the same output file as the other cells.
OUTPUT_PATH = "output.wav"

tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
# Run TTS
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
# Run TTS with emotion and speed control
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
160
+
161
+ #model test 4 for text to speech
162
+ from IPython.display import Audio
163
+
164
+ from TTS.api import TTS
165
+ #api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda")
166
+ #api.tts_to_file("This is a test.", file_path="output.wav")
167
+
168
+ # TTS with on the fly voice conversion
169
+ api = TTS("tts_models/deu/fairseq/vits")
170
+ api.tts_with_vc_to_file(
171
+ "I am a basic human",
172
+ speaker_wav="speaker.webm",
173
+ file_path="output.wav"
174
+ )
175
+
176
+ audio_path = "output.wav"
177
+ Audio(audio_path)