Voice_Assistant_TTS_long

Sleeping

App Files Files Community

Siddhant committed on Sep 6

Commit

f7f39bd

•

1 Parent(s): 5e6b5bb

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -83

app.py CHANGED Viewed

@@ -1,105 +1,130 @@
 from transformers import pipeline
-import torch
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-classifier = pipeline(
-    "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
-)
-from transformers.pipelines.audio_utils import ffmpeg_microphone_live
-def launch_fn(
-    wake_word="marvin",
-    prob_threshold=0.5,
-    chunk_length_s=2.0,
-    stream_chunk_s=0.25,
-    debug=False,
-):
-    if wake_word not in classifier.model.config.label2id.keys():
-        raise ValueError(
-            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
-        )
-    sampling_rate = classifier.feature_extractor.sampling_rate
-    mic = ffmpeg_microphone_live(
-        sampling_rate=sampling_rate,
-        chunk_length_s=chunk_length_s,
-        stream_chunk_s=stream_chunk_s,
-    )
-    print("Listening for wake word...")
-    for prediction in classifier(mic):
-        prediction = prediction[0]
-        if debug:
-            print(prediction)
-        if prediction["label"] == wake_word:
-            if prediction["score"] > prob_threshold:
-                return True
-transcriber = pipeline(
-    "automatic-speech-recognition", model="openai/whisper-base.en", device=device
-)
-import sys
-def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
-    sampling_rate = transcriber.feature_extractor.sampling_rate
-    mic = ffmpeg_microphone_live(
-        sampling_rate=sampling_rate,
-        chunk_length_s=chunk_length_s,
-        stream_chunk_s=stream_chunk_s,
-    )
-    print("Start speaking...")
-    for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
-        sys.stdout.write("\033[K")
-        print(item["text"], end="\r")
-        if not item["partial"][0]:
-            break
-    return item["text"]
-from huggingface_hub import HfFolder
-import requests
-def query(text, model_id="tiiuae/falcon-7b-instruct"):
-    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
-    headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
-    payload = {"inputs": text}
-    print(f"Querying...: {text}")
-    response = requests.post(api_url, headers=headers, json=payload)
-    return response.json()[0]["generated_text"][len(text) + 1 :]
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-from datasets import load_dataset
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-def synthesise(text):
-    inputs = processor(text=text, return_tensors="pt")
-    speech = model.generate_speech(
-        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
-    )
-    return speech.cpu()
-if __name__ == "__main__":
-    launch_fn(debug=True)
-    # transcription = transcribe()
-    # response = query(transcription)
-    # audio = synthesise(response)
-    # Audio(audio, rate=16000, autoplay=True)

+import gradio as gr
 from transformers import pipeline
+import numpy as np
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
+def transcribe(stream, new_chunk):
+    sr, y = new_chunk
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+    if stream is not None:
+        stream = np.concatenate([stream, y])
+    else:
+        stream = y
+    return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]
+demo = gr.Interface(
+    transcribe,
+    ["state", gr.Audio(sources=["microphone"], streaming=True)],
+    ["state", "text"],
+    live=True,
+)
+demo.launch()
+# from transformers import pipeline
+# import torch
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# classifier = pipeline(
+#     "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
+# )
+# from transformers.pipelines.audio_utils import ffmpeg_microphone_live
+# def launch_fn(
+#     wake_word="marvin",
+#     prob_threshold=0.5,
+#     chunk_length_s=2.0,
+#     stream_chunk_s=0.25,
+#     debug=False,
+# ):
+#     if wake_word not in classifier.model.config.label2id.keys():
+#         raise ValueError(
+#             f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
+#         )
+#     sampling_rate = classifier.feature_extractor.sampling_rate
+#     mic = ffmpeg_microphone_live(
+#         sampling_rate=sampling_rate,
+#         chunk_length_s=chunk_length_s,
+#         stream_chunk_s=stream_chunk_s,
+#     )
+#     print("Listening for wake word...")
+#     for prediction in classifier(mic):
+#         prediction = prediction[0]
+#         if debug:
+#             print(prediction)
+#         if prediction["label"] == wake_word:
+#             if prediction["score"] > prob_threshold:
+#                 return True
+# transcriber = pipeline(
+#     "automatic-speech-recognition", model="openai/whisper-base.en", device=device
+# )
+# import sys
+# def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
+#     sampling_rate = transcriber.feature_extractor.sampling_rate
+#     mic = ffmpeg_microphone_live(
+#         sampling_rate=sampling_rate,
+#         chunk_length_s=chunk_length_s,
+#         stream_chunk_s=stream_chunk_s,
+#     )
+#     print("Start speaking...")
+#     for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
+#         sys.stdout.write("\033[K")
+#         print(item["text"], end="\r")
+#         if not item["partial"][0]:
+#             break
+#     return item["text"]
+# from huggingface_hub import HfFolder
+# import requests
+# def query(text, model_id="tiiuae/falcon-7b-instruct"):
+#     api_url = f"https://api-inference.huggingface.co/models/{model_id}"
+#     headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
+#     payload = {"inputs": text}
+#     print(f"Querying...: {text}")
+#     response = requests.post(api_url, headers=headers, json=payload)
+#     return response.json()[0]["generated_text"][len(text) + 1 :]
+# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+# from datasets import load_dataset
+# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+# def synthesise(text):
+#     inputs = processor(text=text, return_tensors="pt")
+#     speech = model.generate_speech(
+#         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
+#     )
+#     return speech.cpu()
+# if __name__ == "__main__":
+#     launch_fn(debug=True)
+#     # transcription = transcribe()
+#     # response = query(transcription)
+#     # audio = synthesise(response)
+#     # Audio(audio, rate=16000, autoplay=True)