Spaces:

sasan
/

KITT

Build error

App Files Files Community

Abigail committed on Oct 27, 2023

Commit

5817c5e

1 Parent(s): 1f2d661

first commit tts and stt with multiple stt possibilities

Browse files

Files changed (2) hide show

.DS_Store +0 -0
stttotts.py +177 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

stttotts.py ADDED Viewed

	@@ -0,0 +1,177 @@

+# -*- coding: utf-8 -*-
+"""sttToTts.ipynb
+Automatically generated by Colaboratory.
+Original file is located at
+    https://colab.research.google.com/drive/15QqRKFSwfhRdnaj5-R1z6xFfeEOOta38
+"""
+# Text-to-speech and speech-to-text demo (exported from a Colab notebook).
+# The `!pip` lines are notebook shell magics: they only work inside
+# IPython/Colab and are a syntax error when this file runs as plain Python.
+!pip install TTS
+!pip install transformers
+# Text-to-speech: load the multilingual YourTTS model on the GPU.
+# NOTE(review): `cs_api_model` is given a dotted-path string here; confirm
+# that the installed TTS version accepts this value.
+from TTS.api import TTS
+tts = TTS("tts_models/multilingual/multi-dataset/your_tts", cs_api_model  = "TTS.cs_api.CS_API", gpu=True)
+#voice recording
+import IPython.display
+import google.colab.output
+import base64
+# all imports for voice recording
+from IPython.display import Javascript
+from google.colab import output
+from base64 import b64decode
+# JavaScript helper injected into the notebook front end to record audio.
+# Adapted from https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be
+# It defines `record(time)`: capture the microphone with MediaRecorder for
+# `time` milliseconds, then resolve with the recording as a data URL
+# (produced by FileReader.readAsDataURL).
+RECORD = """
+const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
+const b2text = blob => new Promise(resolve => {
+  const reader = new FileReader()
+  reader.onloadend = e => resolve(e.srcElement.result)
+  reader.readAsDataURL(blob)
+})
+var record = time => new Promise(async resolve => {
+  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+  recorder = new MediaRecorder(stream)
+  chunks = []
+  recorder.ondataavailable = e => chunks.push(e.data)
+  recorder.start()
+  await sleep(time)
+  recorder.onstop = async ()=>{
+    blob = new Blob(chunks)
+    text = await b2text(blob)
+    resolve(text)
+  }
+  recorder.stop()
+})
+"""
+def record(name, sec):
+  display(Javascript(RECORD))
+  s = output.eval_js('record(%d)' % (sec*1000))
+  b = b64decode(s.split(',')[1])
+  with open(f'{name}.webm','wb') as f:
+    f.write(b)
+  return (f'{name}.webm')  # or webm ?
+# Record 10 seconds of speech to be transcribed; record() saves it as audio.webm.
+record('audio', sec = 10)
+# Speech-to-text (reported working): transcribe an audio file given by path
+# with the openai/whisper-small checkpoint.
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import librosa
+# Load the Whisper model and its processor.
+processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
+# NOTE(review): presumably cleared so generation is not forced to a fixed
+# language/task prompt — confirm against the Whisper documentation.
+model.config.forced_decoder_ids = None
+# Load the recording made above.
+audio_path = "audio.webm"
+audio_array, sampling_rate = librosa.load(audio_path, sr=16000)  # "sr=16000" ensures that the sampling rate is as required
+# Turn the waveform into model input features (PyTorch tensors).
+# NOTE(review): sampling_rate is passed positionally here; confirm the
+# processor binds it to `sampling_rate` (the keyword form would be safer).
+input_features = processor(audio_array, sampling_rate, return_tensors="pt").input_features
+# Generate token ids and decode them to text, dropping special tokens.
+predicted_ids = model.generate(input_features)
+transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+print(transcription)
+# Record 10 seconds of the speaker's voice, used later as the TTS reference
+# voice; record() saves it as speaker.webm.
+record('speaker', sec = 10 )
+# inflect is the library used to convert digits to words (e.g. 1 --> one).
+import locale
+# Monkey-patch the locale lookup so it always reports UTF-8.
+# NOTE(review): presumably a workaround so the `!pip` shell magic below keeps
+# working in Colab at this point of the notebook — confirm it is still needed.
+locale.getpreferredencoding = lambda: "UTF-8"
+!pip install inflect
+import re
+import inflect
+#because numbers under digit format are ignored otherwise
+def convert_numbers_to_words(s):
+    p = inflect.engine()
+    # Find all sequences of digits in the string
+    numbers = re.findall(r'\d+', s)
+    for number in numbers:
+        # Convert each number to words
+        words = p.number_to_words(number)
+        # Replace the original number in the string with its word representation
+        s = s.replace(number, words)
+    return s
+#model test 1 for text to speech
+#works - text to speech with voice cloner (by providing the path to the audio where the voice is)
+from google.colab import drive
+from IPython.display import Audio
+tts.tts_to_file(text=convert_numbers_to_words(str(transcription)),
+                file_path="output.wav",
+                speaker_wav='speaker.webm',
+                language="en",
+                emotion ='angry',
+                speed = 2)
+audio_path = "output.wav"
+Audio(audio_path)
+#model test 2 for text to speech
+from IPython.display import Audio
+# TTS with on the fly voice conversion
+api = TTS("tts_models/deu/fairseq/vits")
+api.tts_with_vc_to_file(
+    text="Wie sage ich auf Italienisch, dass ich dich liebe?",
+    speaker_wav="speaker.webm",
+    file_path="ouptut.wav"
+)
+audio_path = "output.wav"
+Audio(audio_path)
+# Model test 3 for text-to-speech: XTTS v1 on the GPU.
+# This rebinds `tts`, replacing whatever model it held before.
+from TTS.api import TTS
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)
+from IPython.display import Audio
+# Generate speech by cloning the voice in the local file "Slide 1.m4a" with
+# custom settings. That file is not produced by this script and must exist.
+tts.tts_to_file(text="But for me to rap like a computer it must be in my genes I got a laptop in my back pocket My pen'll go off when I half-cock it Got a fat knot from that rap profit Made a livin' and a killin' off it Ever since Bill Clinton was still in office with Monica Lewinsky feelin' on his nutsack I'm an MC still as honest",
+                file_path="output.wav",
+                speaker_wav="Slide 1.m4a",
+                language="en",
+                emotion = "neutral",
+                decoder_iterations=35)
+# Play back the synthesized file in the notebook.
+audio_path = "output.wav"
+Audio(audio_path)
+# Init TTS with the target studio speaker
+tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
+# Run TTS
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+# Run TTS with emotion and speed control
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
+# Model test 4 for text-to-speech: VITS with on-the-fly voice conversion to
+# the voice recorded in speaker.webm.
+from IPython.display import Audio
+from TTS.api import TTS
+# Disabled variant: English VITS model on CUDA without voice conversion.
+#api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda")
+#api.tts_to_file("This is a test.", file_path="output.wav")
+# TTS with on the fly voice conversion
+# NOTE(review): the model id below contains "deu" while the text is English
+# and the disabled variant above uses "eng" — confirm the intended model.
+api = TTS("tts_models/deu/fairseq/vits")
+api.tts_with_vc_to_file(
+    "I am a basic human",
+    speaker_wav="speaker.webm",
+    file_path="output.wav"
+)
+# Play back the synthesized file in the notebook.
+audio_path = "output.wav"
+Audio(audio_path)