import gradio as gr
from transformers import pipeline
import numpy as np
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")


def transcribe(stream, new_chunk):
    sr, y = new_chunk
    y = y.astype(np.float32)

    # Normalize the chunk, guarding against division by zero on silent input.
    max_abs = np.max(np.abs(y))
    if max_abs > 0:
        y /= max_abs

    # Accumulate audio in the session state so Whisper always sees the full utterance so far.
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    text = transcriber({"sampling_rate": sr, "raw": stream})["text"]
    # Return the updated state, the running transcription, and echo the latest chunk.
    return stream, text, new_chunk


demo = gr.Interface(
    transcribe,
    ["state", gr.Audio(sources=["microphone"], streaming=True)],
    ["state", "text", gr.Audio(label="Output", streaming=True, autoplay=True)],
    live=True,
)

demo.launch()
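# The commented-out code below sketches an offline wake-word voice assistant:
# an audio classifier listens for the wake word "marvin", Whisper transcribes
# the spoken request, a hosted LLM generates a reply, and SpeechT5 synthesises
# that reply back into speech.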
# from transformers import pipeline
# import torch
#
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
#
# classifier = pipeline(
#     "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
# )
#
# from transformers.pipelines.audio_utils import ffmpeg_microphone_live
#
#
# def launch_fn(
#     wake_word="marvin",
#     prob_threshold=0.5,
#     chunk_length_s=2.0,
#     stream_chunk_s=0.25,
#     debug=False,
# ):
#     if wake_word not in classifier.model.config.label2id.keys():
#         raise ValueError(
#             f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
#         )
#
#     sampling_rate = classifier.feature_extractor.sampling_rate
#
#     mic = ffmpeg_microphone_live(
#         sampling_rate=sampling_rate,
#         chunk_length_s=chunk_length_s,
#         stream_chunk_s=stream_chunk_s,
#     )
#
#     print("Listening for wake word...")
#     for prediction in classifier(mic):
#         prediction = prediction[0]
#         if debug:
#             print(prediction)
#         if prediction["label"] == wake_word:
#             if prediction["score"] > prob_threshold:
#                 return True
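#
# Streaming speech-to-text: once the wake word fires, transcribe the spoken
# request from the live microphone feed, stopping when the output is no
# longer partial.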
# transcriber = pipeline(
#     "automatic-speech-recognition", model="openai/whisper-base.en", device=device
# )
#
# import sys
#
#
# def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
#     sampling_rate = transcriber.feature_extractor.sampling_rate
#
#     mic = ffmpeg_microphone_live(
#         sampling_rate=sampling_rate,
#         chunk_length_s=chunk_length_s,
#         stream_chunk_s=stream_chunk_s,
#     )
#
#     print("Start speaking...")
#     for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
#         sys.stdout.write("\033[K")
#         print(item["text"], end="\r")
#         if not item["partial"][0]:
#             break
#
#     return item["text"]
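#
# Query a hosted instruction-tuned LLM through the Hugging Face Inference API
# to generate a text reply to the transcribed request.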
# from huggingface_hub import HfFolder
# import requests
#
#
# def query(text, model_id="tiiuae/falcon-7b-instruct"):
#     api_url = f"https://api-inference.huggingface.co/models/{model_id}"
#     headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
#     payload = {"inputs": text}
#
#     print(f"Querying...: {text}")
#     response = requests.post(api_url, headers=headers, json=payload)
#     return response.json()[0]["generated_text"][len(text) + 1 :]
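#
# Text-to-speech: SpeechT5 with the HiFi-GAN vocoder, conditioned on a speaker
# embedding from the CMU ARCTIC x-vectors dataset, converts the reply to audio.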
# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
#
# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
#
# from datasets import load_dataset
#
# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
#
#
# def synthesise(text):
#     inputs = processor(text=text, return_tensors="pt")
#     speech = model.generate_speech(
#         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
#     )
#     return speech.cpu()
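#
# Intended end-to-end flow (the later steps are left commented out here):
# wait for the wake word, then transcribe -> query -> synthesise -> play.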
# if __name__ == "__main__":
#     launch_fn(debug=True)
#     # transcription = transcribe()
#     # response = query(transcription)
#     # audio = synthesise(response)
#     # Audio(audio, rate=16000, autoplay=True)  # Audio here is presumably IPython.display.Audio