import gradio as gr
from transformers import pipeline
import numpy as np

# Load a small English-only Whisper checkpoint for streaming transcription.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

def transcribe(stream, new_chunk):
    # Each chunk arrives as a (sample_rate, samples) tuple from gr.Audio(streaming=True).
    sr, y = new_chunk
    y = y.astype(np.float32)
    # Normalise to [-1, 1]; guard against all-zero (silent) chunks to avoid division by zero.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Accumulate audio across chunks so the full utterance is re-transcribed each time.
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"], new_chunk

demo = gr.Interface(
    transcribe,
    # Inputs: accumulated audio state + streaming microphone chunks.
    ["state", gr.Audio(sources=["microphone"], streaming=True)],
    # Outputs: updated state, running transcription, and the latest chunk echoed back as audio.
    ["state", "text", gr.Audio(label="Output", streaming=True, autoplay=True)],
    live=True,
)

demo.launch()
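# Hedged sketch (not part of the original demo): the accumulated stream above grows
# without bound, so re-transcribing it gets slower over time. One assumed workaround
# is to keep only the most recent N seconds before calling the pipeline, e.g.:
#
# MAX_SECONDS = 30  # hypothetical window length
#
# def transcribe_windowed(stream, new_chunk):
#     sr, y = new_chunk
#     y = y.astype(np.float32)
#     peak = np.max(np.abs(y))
#     if peak > 0:
#         y /= peak
#     stream = y if stream is None else np.concatenate([stream, y])
#     stream = stream[-MAX_SECONDS * sr:]  # drop audio older than the window
#     return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"], new_chunk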
# from transformers import pipeline
# import torch

# device = "cuda:0" if torch.cuda.is_available() else "cpu"

# classifier = pipeline(
#     "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
# )

# from transformers.pipelines.audio_utils import ffmpeg_microphone_live


# def launch_fn(
#     wake_word="marvin",
#     prob_threshold=0.5,
#     chunk_length_s=2.0,
#     stream_chunk_s=0.25,
#     debug=False,
# ):
#     if wake_word not in classifier.model.config.label2id.keys():
#         raise ValueError(
#             f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
#         )

#     sampling_rate = classifier.feature_extractor.sampling_rate

#     mic = ffmpeg_microphone_live(
#         sampling_rate=sampling_rate,
#         chunk_length_s=chunk_length_s,
#         stream_chunk_s=stream_chunk_s,
#     )

#     print("Listening for wake word...")
#     for prediction in classifier(mic):
#         prediction = prediction[0]
#         if debug:
#             print(prediction)
#         if prediction["label"] == wake_word:
#             if prediction["score"] > prob_threshold:
#                 return True

# transcriber = pipeline(
#     "automatic-speech-recognition", model="openai/whisper-base.en", device=device
# )
# import sys


# def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
#     sampling_rate = transcriber.feature_extractor.sampling_rate

#     mic = ffmpeg_microphone_live(
#         sampling_rate=sampling_rate,
#         chunk_length_s=chunk_length_s,
#         stream_chunk_s=stream_chunk_s,
#     )

#     print("Start speaking...")
#     for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
#         sys.stdout.write("\033[K")
#         print(item["text"], end="\r")
#         if not item["partial"][0]:
#             break

#     return item["text"]

# from huggingface_hub import HfFolder
# import requests


# def query(text, model_id="tiiuae/falcon-7b-instruct"):
#     api_url = f"https://api-inference.huggingface.co/models/{model_id}"
#     headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
#     payload = {"inputs": text}

#     print(f"Querying...: {text}")
#     response = requests.post(api_url, headers=headers, json=payload)
#     return response.json()[0]["generated_text"][len(text) + 1 :]

# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# from datasets import load_dataset

# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# def synthesise(text):
#     inputs = processor(text=text, return_tensors="pt")
#     speech = model.generate_speech(
#         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
#     )
#     return speech.cpu()


# if __name__ == "__main__":
#     launch_fn(debug=True)
#     # transcription = transcribe()
#     # response = query(transcription)
#     # audio = synthesise(response)
    
#     # Audio(audio, rate=16000, autoplay=True)
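#     # "Audio" above is presumably IPython.display.Audio (notebook-only playback).
#     # A hedged, script-friendly alternative would be to write the waveform to disk:
#     # import soundfile as sf
#     # sf.write("response.wav", audio.numpy(), samplerate=16000)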